In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **SETUP**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Reading the dataset and storing it in a DataFrame (`data`)**

In [3]:
# Load the UK universities CSV from the Kaggle input directory into `data`.
data = pd.read_csv(
    "/kaggle/input/best-universities-in-the-united-kingdom/uk_universities.csv"
)

In [4]:
# Preview the first rows to sanity-check that the file loaded as expected.
data.head()

**Getting Shape**

In [6]:
# data.shape is (rows, columns); report both counts on one line.
n_records, n_features = data.shape
print("Records : ", n_records, "Features : ", n_features)

In [7]:
# Column dtypes and non-null counts of the freshly loaded frame.
data.info()

# **Data Cleaning**

**Checking for NULL Values**

In [15]:
# Per-column flag: True where the column contains at least one missing value.
data.isna().any()

In [16]:
# Per-column count of missing values.
data.isna().sum()

In [20]:
# Summarise missingness per column: absolute count and share of all rows (in %).
null_counts = data.isna().sum()
missing_values = pd.DataFrame(
    {
        'Missing_Values_Count': null_counts,
        "Percentage": null_counts / len(data) * 100,
    }
)
# Colour the table so the columns with the most gaps stand out.
missing_values.style.background_gradient(cmap='hot')

**Filling Missing Values**

**Replacing missing Motto values with the string "NULL"**

In [22]:
# Replace missing mottos with the literal string "NULL".
# Assign the result back instead of calling an inplace method on the column
# selection: the chained inplace form is deprecated and does not modify `data`
# under pandas copy-on-write. fillna also avoids the `np.NaN` alias, which was
# removed in NumPy 2.0.
data['Motto'] = data['Motto'].fillna("NULL")

**Grouping by Region (used below to fill Academic_Calender)**

In [27]:
# Group the rows by Region; the resulting GroupBy object is used below to
# forward-fill Academic_Calender within each region.
# NOTE(review): pandas' groupby excludes rows whose key is NaN by default —
# confirm Region has no missing values, otherwise those rows belong to no group.
grp_data = data.groupby(['Region'])

**Filling Academic_Calender Feature using groupby of Region Feature**

In [35]:
# Forward-fill missing Academic_Calender values within each Region group, so a
# gap takes the value of the previous row of the same region.
# Uses Series.ffill() because fillna(method='ffill') is deprecated in pandas 2.x.
# NOTE(review): a missing value that is the first row of its region has nothing
# before it to copy and stays NaN — check value_counts()/isna() afterwards.
data['Academic_Calender'] = grp_data['Academic_Calender'].transform(lambda x : x.ffill())

In [37]:
# Frequency of each Academic_Calender category after the fill.
data['Academic_Calender'].value_counts()

**Filling Campus_setting Feature**

In [38]:
# Forward-fill missing Campus_setting values from the previous row.
# Uses Series.ffill() because fillna(method='ffill') is deprecated in pandas 2.x.
# NOTE(review): a missing value in the very first row has no predecessor and
# would stay NaN.
data['Campus_setting'] = data['Campus_setting'].ffill()

In [40]:
# Frequency of each Campus_setting category after the fill.
data['Campus_setting'].value_counts()

**Filling CWUR_score Feature using interpolate**

In [46]:
# Distribution of CWUR_score with its mean and median marked, to judge how
# skewed the column is before filling its gaps.
# Missing values are dropped explicitly for the histogram; Series.mean() and
# Series.median() already skip them.
plt.hist(data['CWUR_score'].dropna(),bins=20)
plt.axvline(data['CWUR_score'].mean(),color='g',label='mean')
plt.axvline(data['CWUR_score'].median(),color='r',label='median')
# Label the axes and the two reference lines so the figure stands on its own.
plt.xlabel('CWUR_score')
plt.ylabel('Count')
plt.legend()
plt.show()

In [41]:
# Fill gaps in CWUR_score by linear interpolation between neighbouring rows
# ('linear' is the pandas default, spelled out here for the reader).
data['CWUR_score'] = data['CWUR_score'].interpolate(method='linear')

In [44]:
# Inspect the CWUR_score column after interpolation.
data['CWUR_score']

**After Data Cleaning**

In [45]:
# Visual check for remaining gaps: data.isna() is a boolean frame (True = missing)
# and the heatmap renders it cell by cell.
sns.heatmap(data.isna())

# **Checking For Outliers**

**Outlier Detection Using IQR**

**BoxPlot for numerical feature**

In [55]:
# Keep every column whose dtype is not `object`.
# NOTE: despite its name, `col` is a DataFrame here; iterating over it in the
# loop below yields its column labels.
col = data.select_dtypes(exclude='object')

In [59]:
# One box plot per non-object feature, before any outlier handling.
for i in col:
    print("Feature : ",i)
    # plt.boxplot does not skip NaN: a single missing value turns the quartiles
    # into NaN and the box is not drawn, so drop missing values per feature.
    plt.boxplot(data[i].dropna())
    plt.title(i)
    plt.show()

In [61]:
# Features that go through the IQR-based outlier filter below.
# `col` is a DataFrame; the later loops iterate over its column labels.
col = data[
    [
        'Longitude',
        'Latitude',
        'Estimated_cost_of_living_per_year_(in_pounds)',
        'PG_average_fees_(in_pounds)',
        'UG_average_fees_(in_pounds)',
        'CWUR_score',
        'World_rank',
        'UK_rank',
        'Founded_year',
    ]
]

In [62]:
def IQR(data,col):
    """Return the interquartile range and both quartiles of one column.

    Parameters
    ----------
    data : object indexable by ``col`` whose items expose ``quantile``
        (e.g. a pandas DataFrame).
    col : key used to select the column from ``data``.

    Returns
    -------
    tuple
        ``(iqr, q1, q3)`` where ``q1``/``q3`` are the 0.25/0.75 quantiles and
        ``iqr`` is ``q3 - q1``.
    """
    values = data[col]
    lower_quartile, upper_quartile = (values.quantile(p) for p in (0.25, 0.75))
    return upper_quartile - lower_quartile, lower_quartile, upper_quartile

In [64]:
# Remove outliers feature by feature using the 1.5*IQR fences.
# The filter is applied sequentially: `data` is reassigned on every iteration,
# so each feature's quartiles are computed on the rows that survived the
# previous features. Rows with a missing value in a feature are dropped as
# well, because NaN never satisfies the range test.
for i in col:
    iqr,q1,q3 = IQR(data,i)
    lower = q1 - 1.5*iqr
    upper = q3 + 1.5*iqr
    # between() is inclusive on both ends: values lying exactly on a fence are
    # not outliers. With strict comparisons a feature whose IQR is 0
    # (lower == upper) would match no row at all and empty the whole frame.
    data = data[data[i].between(lower, upper)]
# Take an explicit copy so that later column assignments write to an
# independent frame instead of a slice of the original (avoids
# SettingWithCopyWarning).
data = data.copy()

In [87]:
# Re-draw one box plot per selected feature to inspect the data after filtering.
for i in col:
    print("Feature : ", i)
    fig, ax = plt.subplots()
    ax.boxplot(data[i])
    plt.show()

**Outlier Detection and Removal Done**

# **EDA**

In [79]:
# Re-check dtypes and non-null counts now that rows have been filtered out above.
data.info()

In [118]:
# Median UG/PG fees and minimum IELTS score for each Campus_setting category
# (one column per campus setting, one row per value field).
# The aggregation is named by string: passing the callable np.median raises a
# FutureWarning in pandas 2.x, while 'median' selects pandas' own median.
p_table = pd.pivot_table(
    data,
    values=['UG_average_fees_(in_pounds)','PG_average_fees_(in_pounds)','Minimum_IELTS_score'],
    columns='Campus_setting',
    aggfunc='median',
)
p_table

**Feature Scaling**

**Normalization Method**

In [119]:
# Box plot of the raw UG fees, as a reference before scaling.
data['UG_average_fees_(in_pounds)'].plot(kind='box')

In [123]:
# Min-max normalisation of the UG fees: (x - min) / (max - min), mapping the
# column onto [0, 1].
# Series.min()/max() are used instead of Python's built-in min()/max(): the
# built-ins iterate element by element and give order-dependent results when a
# NaN is present, whereas the Series methods are vectorised and skip NaN.
# NOTE(review): if every fee is identical, max == min and the result is NaN.
ug_fees = data['UG_average_fees_(in_pounds)']
data['UG_Fess'] = (ug_fees - ug_fees.min()) / (ug_fees.max() - ug_fees.min())

In [126]:
# Inspect the min-max scaled UG fees.
data['UG_Fess']

In [128]:
# Box plot of the scaled UG fees, for comparison with the raw box plot above.
data['UG_Fess'].plot(kind='box')

In [131]:
# Min-max normalisation of the PG fees: (x - min) / (max - min), mapping the
# column onto [0, 1].
# Series.min()/max() are used instead of Python's built-in min()/max(): the
# built-ins iterate element by element and give order-dependent results when a
# NaN is present, whereas the Series methods are vectorised and skip NaN.
# NOTE(review): if every fee is identical, max == min and the result is NaN.
pg_fees = data['PG_average_fees_(in_pounds)']
data['PG_Fess'] = (pg_fees - pg_fees.min()) / (pg_fees.max() - pg_fees.min())

In [132]:
# Kernel density estimate of the min-max scaled PG fees.
sns.kdeplot(data['PG_Fess'],fill=True)

In [133]:
# Side-by-side densities of the min-max scaled fees: UG on the left, PG on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(25, 7))
ug_scaled = data['UG_Fess']
pg_scaled = data['PG_Fess']
sns.kdeplot(ug_scaled, fill=True, ax=ax1)
sns.kdeplot(pg_scaled, fill=True, ax=ax2)

**Standardization (Standard Scaler)**

In [134]:
# Z-score standardisation of the UG fees: subtract the mean, divide by the
# population standard deviation (ddof=0, which is what np.std uses).
data['Std_UG'] = data['UG_average_fees_(in_pounds)'].pipe(
    lambda fees: (fees - fees.mean()) / fees.std(ddof=0)
)

In [135]:
# Inspect the standardised UG fees.
data['Std_UG']

In [136]:
# Z-score standardisation of the PG fees: subtract the mean, divide by the
# population standard deviation (ddof=0, which is what np.std uses).
data['Std_PG'] = data['PG_average_fees_(in_pounds)'].pipe(
    lambda fees: (fees - fees.mean()) / fees.std(ddof=0)
)

In [137]:
# Inspect the standardised PG fees.
data['Std_PG']

In [138]:
# Side-by-side densities of the standardised fees.
# Panel order is UG on the left and PG on the right, the same layout as the
# min-max figure, so the two figures can be compared panel by panel; each panel
# gets a title because the two curves are otherwise indistinguishable.
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(25,7))
sns.kdeplot(data['Std_UG'],fill=True,ax=ax1)
ax1.set_title('Standardized UG average fees')
sns.kdeplot(data['Std_PG'],fill=True,ax=ax2)
ax2.set_title('Standardized PG average fees')
plt.show()