# 1. Center for World University Rankings Exploratory Analysis

### Importing Libraries

In [16]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

### Importing dataset

In [17]:
# Set path variable
# The project root can be overridden with the CWUR_PROJECT_PATH environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local project folder is used.

path = os.environ.get(
    'CWUR_PROJECT_PATH',
    r'C:\Users\HP\Documents\CareerFoundry\Data Immersion\Achievement 6\World University Rankings Analysis',
)

In [18]:
# Load the raw CWUR rankings CSV from the project's 'Original Data' folder

df_cwur = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'cwurData.csv')
)

In [19]:
# Preview the first 10 rows to confirm the file was read correctly

df_cwur.head(10)

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012
5,6,Princeton University,USA,5,8,14,2,53,33,26,,101,82.5,2012
6,7,University of Oxford,United Kingdom,2,13,28,9,15,13,19,,26,82.34,2012
7,8,Yale University,USA,6,14,31,12,14,6,15,,66,79.14,2012
8,9,Columbia University,USA,7,23,21,10,13,12,14,,5,78.86,2012
9,10,"University of California, Berkeley",USA,8,16,52,6,6,5,3,,16,78.55,2012


In [20]:
# Check dimensions of dataframe as (number of rows, number of columns)

df_cwur.shape

(2200, 14)

### Data Cleaning

#### Dropping Irrelevant Columns

In [21]:
# List the column names to judge whether any of them can be dropped

df_cwur.columns

Index(['world_rank', 'institution', 'country', 'national_rank',
       'quality_of_education', 'alumni_employment', 'quality_of_faculty',
       'publications', 'influence', 'citations', 'broad_impact', 'patents',
       'score', 'year'],
      dtype='object')

It looks like all the columns would be relevant for analysis. 

#### Renaming Columns

In [22]:
# Rename certain columns to be more intuitive:
# rank-valued metrics get a '_rank' suffix and CWUR-specific fields a '_CWUR' suffix.
# The result is reassigned instead of using inplace=True, so the frame is not
# mutated behind the back of cells that already displayed it. Keys that are no
# longer present are ignored by rename, so re-running this cell is harmless.

df_cwur = df_cwur.rename(columns = {
    'world_rank' : 'world_rank_CWUR',
    'institution' : 'university_name',
    'quality_of_education' : 'quality_of_education_rank',
    'alumni_employment' : 'alumni_employment_rank',
    'quality_of_faculty' : 'quality_of_faculty_rank',
    'publications' : 'publications_rank',
    'influence' : 'influence_rank',
    'citations' : 'citations_rank',
    'patents' : 'patents_rank',
    'broad_impact' : 'broad_impact_rank',
    'score' : 'overall_score_CWUR',
})

In [23]:
# Check the column names again to confirm the renames were applied

df_cwur.columns

Index(['world_rank_CWUR', 'university_name', 'country', 'national_rank',
       'quality_of_education_rank', 'alumni_employment_rank',
       'quality_of_faculty_rank', 'publications_rank', 'influence_rank',
       'citations_rank', 'broad_impact_rank', 'patents_rank',
       'overall_score_CWUR', 'year'],
      dtype='object')

#### Changing a Variable Data Type

In [24]:
# Inspect the data type pandas assigned to each column
df_cwur.dtypes

world_rank_CWUR                int64
university_name               object
country                       object
national_rank                  int64
quality_of_education_rank      int64
alumni_employment_rank         int64
quality_of_faculty_rank        int64
publications_rank              int64
influence_rank                 int64
citations_rank                 int64
broad_impact_rank            float64
patents_rank                   int64
overall_score_CWUR           float64
year                           int64
dtype: object

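All of the datatypes seem appropriate. Note that 'broad_impact_rank' is float64 rather than int64 because it contains missing values (NaN), which a standard integer column cannot hold.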

#### Addressing Mixed Data Columns

In [25]:
# Print the name of every column that holds more than one Python type
# (for example strings mixed with floats). Nothing is printed when all
# columns are consistently typed.
# Series.map(type) replaces DataFrame.applymap, which is deprecated since
# pandas 2.1; counting distinct types also works on an empty dataframe,
# where looking up the first row would raise an IndexError.
for col in df_cwur.columns.tolist():
  col_types = df_cwur[col].map(type)
  if col_types.nunique() > 1:
    print (col)

There are no mixed-type columns.

#### Addressing Missing Values

In [26]:
# Count the missing (null) values in each column
df_cwur.isnull().sum()

world_rank_CWUR                0
university_name                0
country                        0
national_rank                  0
quality_of_education_rank      0
alumni_employment_rank         0
quality_of_faculty_rank        0
publications_rank              0
influence_rank                 0
citations_rank                 0
broad_impact_rank            200
patents_rank                   0
overall_score_CWUR             0
year                           0
dtype: int64

There are 200 missing values in the 'broad_impact_rank' column. These missing values occur only in the 2012 and 2013 data and can largely be ignored. It is likely that 'broad_impact_rank' only became a ranking factor starting in 2014. If required, the entire column can be omitted from the analysis.

#### Addressing Duplicate Values

In [27]:
# Collect every row that exactly repeats an earlier row (all columns equal)
df_dups = df_cwur.loc[df_cwur.duplicated(keep = 'first')]

In [28]:
# Display the duplicated rows; an empty frame means no duplicates were found
df_dups

Unnamed: 0,world_rank_CWUR,university_name,country,national_rank,quality_of_education_rank,alumni_employment_rank,quality_of_faculty_rank,publications_rank,influence_rank,citations_rank,broad_impact_rank,patents_rank,overall_score_CWUR,year


There are no duplicate rows in this dataframe.

### Summary Statistics

In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_cwur.describe()

Unnamed: 0,world_rank_CWUR,national_rank,quality_of_education_rank,alumni_employment_rank,quality_of_faculty_rank,publications_rank,influence_rank,citations_rank,broad_impact_rank,patents_rank,overall_score_CWUR,year
count,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2000.0,2200.0,2200.0,2200.0
mean,459.590909,40.278182,275.100455,357.116818,178.888182,459.908636,459.797727,413.417273,496.6995,433.346364,47.798395,2014.318182
std,304.320363,51.74087,121.9351,186.779252,64.050885,303.760352,303.331822,264.366549,286.919755,273.996525,7.760806,0.76213
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,43.36,2012.0
25%,175.75,6.0,175.75,175.75,175.75,175.75,175.75,161.0,250.5,170.75,44.46,2014.0
50%,450.5,21.0,355.0,450.5,210.0,450.5,450.5,406.0,496.0,426.0,45.1,2014.0
75%,725.25,49.0,367.0,478.0,218.0,725.0,725.25,645.0,741.0,714.25,47.545,2015.0
max,1000.0,229.0,367.0,567.0,218.0,1000.0,991.0,812.0,1000.0,871.0,100.0,2015.0


### Exporting the cleaned file

In [30]:
# Write the cleaned dataframe to the 'Prepared Data' folder.
# index = False keeps the default row index out of the file; otherwise it is
# written as an unnamed first column that shows up as 'Unnamed: 0' when the
# CSV is read back in.
df_cwur.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'cwur_cleaned_final.csv'),
    index = False,
)