# 2. Times Higher Education World University Rankings Exploratory Analysis

## Importing libraries

In [60]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

## Importing dataset

In [61]:
# Set path variable
# The project root can be overridden with the WUR_PROJECT_PATH environment variable so the
# notebook can run on another machine; without it, the original local folder is used.

path = os.environ.get(
    'WUR_PROJECT_PATH',
    r'C:\Users\HP\Documents\CareerFoundry\Data Immersion\Achievement 6\World University Rankings Analysis',
)

In [62]:
# Import dataset

df_times = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'timesData.csv'))

In [63]:
# Check to see if it imported correctly

df_times.head(10)

Unnamed: 0,world_rank,university_name,country,teaching,international,research,citations,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25%,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27%,33 : 67,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33%,37 : 63,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22%,42 : 58,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,-,94.2,7929,8.4,27%,45 : 55,2011
5,6,University of Cambridge,United Kingdom,90.5,77.7,94.1,94.0,57.0,91.2,18812,11.8,34%,46 : 54,2011
6,6,University of Oxford,United Kingdom,88.2,77.2,93.9,95.1,73.5,91.2,19919,11.6,34%,46 : 54,2011
7,8,"University of California, Berkeley",United States of America,84.2,39.6,99.3,97.8,-,91.1,36186,16.4,15%,50 : 50,2011
8,9,Imperial College London,United Kingdom,89.2,90.0,94.5,88.3,92.9,90.6,15060,11.7,51%,37 : 63,2011
9,10,Yale University,United States of America,92.1,59.2,89.7,91.5,-,89.5,11751,4.4,20%,50 : 50,2011


In [64]:
# Check dimensions of dataframe

df_times.shape

(2603, 14)

## Data Cleaning

### Dropping Irrelevant Columns

In [65]:
# Check the column names

df_times.columns

Index(['world_rank', 'university_name', 'country', 'teaching', 'international',
       'research', 'citations', 'income', 'total_score', 'num_students',
       'student_staff_ratio', 'international_students', 'female_male_ratio',
       'year'],
      dtype='object')

It looks like most columns would be relevant for analysis. The rankings appear to be based on scored criteria: teaching, international outlook, research, citations, and income. Additional information such as number of students, student-staff ratio, and female-to-male ratio may lead to more insights. 

### Renaming Columns

In [66]:
# Rename certain columns to be more intuitive
# The old -> new mapping is kept in a named dict, and rename() returns a new frame that is
# assigned back to df_times instead of mutating the frame with inplace = True.

times_column_names = {
    'world_rank' : 'world_rank_Times',
    'teaching' : 'teaching_score',
    'international' : 'international_outlook_score',
    'research' : 'research_score',
    'citations' : 'citations_score',
    'income' : 'income_score',
    'total_score' : 'overall_score_Times',
}

df_times = df_times.rename(columns = times_column_names)

In [67]:
# Check column names to see if they were replaced correctly

df_times.columns

Index(['world_rank_Times', 'university_name', 'country', 'teaching_score',
       'international_outlook_score', 'research_score', 'citations_score',
       'income_score', 'overall_score_Times', 'num_students',
       'student_staff_ratio', 'international_students', 'female_male_ratio',
       'year'],
      dtype='object')

### Changing a Variable Data Type

In [68]:
df_times.dtypes

world_rank_Times                object
university_name                 object
country                         object
teaching_score                 float64
international_outlook_score     object
research_score                 float64
citations_score                float64
income_score                    object
overall_score_Times             object
num_students                    object
student_staff_ratio            float64
international_students          object
female_male_ratio               object
year                             int64
dtype: object

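Several columns that look numeric (for example 'international_outlook_score', 'income_score', and 'num_students') are stored with the `object` data type. This may be because of missing values or non-numeric placeholders in those columns. 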

#### Working with the 'international_outlook_score' column

In [69]:
# Check for missing or strange values in the 'international' column

df_times['international_outlook_score'].value_counts(dropna = False)

29.6    10
20.7    10
46.8     9
34.3     9
-        9
        ..
40.7     1
68.8     1
22.8     1
38.9     1
14.9     1
Name: international_outlook_score, Length: 804, dtype: int64

There are 9 rows in the international_outlook_score column that have blanks ('-'). These can be replaced with NaN values. 

In [70]:
# Replace the '-' with NaN values.

df_times['international_outlook_score'] = df_times['international_outlook_score'].replace('-', np.nan)

In [71]:
# Check the number of NaN values using isnull() and sum()

df_times['international_outlook_score'].isnull().sum()

9

#### Working with the 'income_score' column

In [72]:
# Check for missing or strange values in the 'income' column

df_times['income_score'].value_counts(dropna = False)

-        218
100.0     68
28.0      26
31.1      20
28.8      19
        ... 
55.8       1
89.0       1
73.5       1
84.8       1
89.7       1
Name: income_score, Length: 613, dtype: int64

There are 218 rows in the income_score column that have blanks ('-'). These can be replaced with NaN values. 

In [73]:
# Replace the '-' with NaN values.

df_times['income_score'] = df_times['income_score'].replace('-', np.nan)

In [74]:
# Check the number of NaN values using isnull() and sum()

df_times['income_score'].isnull().sum()

218

#### Working with the 'overall_score_Times' column

In [75]:
# Check for missing or strange values in the 'overall_score' column

df_times['overall_score_Times'].value_counts(dropna = False)

-       1402
49.0      13
51.1      12
46.6      11
51.2      10
        ... 
63.8       1
87.3       1
90.2       1
90.7       1
96.1       1
Name: overall_score_Times, Length: 415, dtype: int64

There are 1402 rows in the overall_score_Times column that have blanks ('-'). These can be replaced with NaN values. 

In [76]:
# Replace the '-' with NaN values.

df_times['overall_score_Times'] = df_times['overall_score_Times'].replace('-', np.nan)

In [77]:
# Check the number of NaN values using isnull() and sum()

df_times['overall_score_Times'].isnull().sum()

1402

#### Working with the 'num_students' column

In [78]:
# Check for missing or strange values in the 'num_students' column

df_times['num_students'].value_counts(dropna = False)

NaN       59
20,152     6
23,280     6
23,144     6
17,581     6
          ..
22,422     1
6,898      1
10,546     1
10,697     1
10,117     1
Name: num_students, Length: 795, dtype: int64

In [79]:
# Check the number of NaN values using isnull() and sum()

df_times['num_students'].isnull().sum()

59

There are 59 rows in the num_students column with NaN values. 

#### Working with the 'student_staff_ratio' column

In [80]:
# Check for missing or strange values in the 'student_staff ratio' column

df_times['student_staff_ratio'].value_counts(dropna = False)

NaN     59
15.9    38
17.4    32
13.0    32
25.9    28
        ..
4.0      1
20.8     1
49.7     1
29.6     1
28.7     1
Name: student_staff_ratio, Length: 309, dtype: int64

In [81]:
# Check the number of NaN values using isnull() and sum()

df_times['student_staff_ratio'].isnull().sum()

59

There are 59 rows in the student_staff_ratio column with NaN values. 

#### Working with the 'international_students' column

In [82]:
# Check for missing or strange values in the 'international_students' column

df_times['international_students'].value_counts(dropna = False)

7%     142
10%    133
9%     130
5%     120
8%     119
12%    104
15%    103
11%    100
16%     90
1%      80
14%     80
17%     79
18%     78
4%      77
13%     76
6%      74
NaN     67
25%     67
20%     66
19%     66
3%      59
22%     56
2%      54
21%     51
23%     50
28%     48
27%     45
26%     45
33%     34
35%     30
0%      26
34%     26
38%     25
24%     24
37%     23
30%     22
29%     18
36%     18
39%     17
31%     13
32%     12
43%      9
44%      6
46%      6
40%      6
47%      6
51%      6
54%      6
48%      5
63%      2
52%      1
50%      1
82%      1
42%      1
Name: international_students, dtype: int64

In [83]:
# Check the number of NaN values using isnull() and sum()

df_times['international_students'].isnull().sum()

67

There are 67 rows in the international_students column with NaN values. 

#### Working with the 'female_male_ratio' column

In [84]:
# Check for missing or strange values in the 'female_male_ratio' column

df_times['female_male_ratio'].value_counts(dropna = False)

NaN        233
54 : 46    185
52 : 48    151
53 : 47    138
55 : 45    135
          ... 
11 : 89      1
68 : 32      1
1 : 99       1
74 : 26      1
9 : 91       1
Name: female_male_ratio, Length: 70, dtype: int64

In [85]:
# Check the number of NaN values using isnull() and sum()

df_times['female_male_ratio'].isnull().sum()

233

There are 233 rows in the female_male_ratio column with NaN values. However, there are also 3 rows with '-' that should be replaced with NaNs to be consistent.

In [86]:
# Replace the '-' with NaN values.

df_times['female_male_ratio'] = df_times['female_male_ratio'].replace('-', np.nan)

In [87]:
# Check the number of NaN values using isnull() and sum()

df_times['female_male_ratio'].isnull().sum()

236

There are now 236 NaN values, which indicates that the 3 '-' values were changed correctly. 

### Addressing Mixed Data Columns

In [88]:
# Check for mixed data types
# A column is printed when at least one of its values has a different Python type than the
# value in its first row. Series.map(type) does the element-wise type lookup because
# DataFrame.applymap is deprecated in pandas 2.1 and later.

for col in df_times.columns.tolist():
    value_types = df_times[col].map(type)
    if (value_types != value_types.iloc[0]).any():
        print(col)

international_outlook_score
income_score
overall_score_Times
num_students
international_students
female_male_ratio


Even after replacing the '-' placeholders, some columns still have mixed data types: the NaN values inserted with np.nan are floats, while the remaining values in those columns are still strings. These columns should therefore be converted to floats so that statistics can be calculated for them.

In [89]:
df_times.dtypes

world_rank_Times                object
university_name                 object
country                         object
teaching_score                 float64
international_outlook_score     object
research_score                 float64
citations_score                float64
income_score                    object
overall_score_Times             object
num_students                    object
student_staff_ratio            float64
international_students          object
female_male_ratio               object
year                             int64
dtype: object

In [90]:
# Convert the three score columns that held '-' placeholders to float64.
# num_students is not converted in this cell.

for score_col in ['international_outlook_score', 'income_score', 'overall_score_Times']:
    df_times[score_col] = df_times[score_col].astype('float64')

In [91]:
df_times.dtypes

world_rank_Times                object
university_name                 object
country                         object
teaching_score                 float64
international_outlook_score    float64
research_score                 float64
citations_score                float64
income_score                   float64
overall_score_Times            float64
num_students                    object
student_staff_ratio            float64
international_students          object
female_male_ratio               object
year                             int64
dtype: object

#### Working with the 'num_students' column

In [92]:
# The num_students column is stored as strings because the counts contain thousands separators (e.g. '20,152').
# The goal is a float column: missing entries stay NaN and the counts become floats.
# First, convert the entire column to strings (missing entries become the string 'nan')

df_times['num_students'] = df_times['num_students'].astype('str')

In [93]:
# Check the datatype

df_times['num_students'].dtype

dtype('O')

In [94]:
# Build the cleaned values in a list: missing entries (the string 'nan') become NaN,
# every other entry keeps its digits with the thousands separators removed

results_num_stud = [
    np.nan if raw_count == 'nan' else raw_count.replace(',', '')
    for raw_count in df_times['num_students']
]

# Store the cleaned values in a temporary column next to the original one
df_times['num_stud_float'] = results_num_stud

df_times.head(20)

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year,num_stud_float
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25%,,2011,20152
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27%,33 : 67,2011,2243
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33%,37 : 63,2011,11074
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22%,42 : 58,2011,15596
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929,8.4,27%,45 : 55,2011,7929
5,6,University of Cambridge,United Kingdom,90.5,77.7,94.1,94.0,57.0,91.2,18812,11.8,34%,46 : 54,2011,18812
6,6,University of Oxford,United Kingdom,88.2,77.2,93.9,95.1,73.5,91.2,19919,11.6,34%,46 : 54,2011,19919
7,8,"University of California, Berkeley",United States of America,84.2,39.6,99.3,97.8,,91.1,36186,16.4,15%,50 : 50,2011,36186
8,9,Imperial College London,United Kingdom,89.2,90.0,94.5,88.3,92.9,90.6,15060,11.7,51%,37 : 63,2011,15060
9,10,Yale University,United States of America,92.1,59.2,89.7,91.5,,89.5,11751,4.4,20%,50 : 50,2011,11751


In [95]:
# Replace the original column with the cleaned up values

df_times['num_students'] = df_times['num_stud_float']

df_times

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year,num_stud_float
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25%,,2011,20152
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27%,33 : 67,2011,2243
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33%,37 : 63,2011,11074
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22%,42 : 58,2011,15596
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929,8.4,27%,45 : 55,2011,7929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,,21958,15.3,3%,48 : 52,2016,21958
2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,,31268,28.7,2%,36 : 64,2016,31268
2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,,4122,3.7,3%,,2016,4122
2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,,10117,12.1,8%,28 : 72,2016,10117


In [96]:
# Drop the newly created column

df_times = df_times.drop(columns = ['num_stud_float'])

df_times

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25%,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27%,33 : 67,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33%,37 : 63,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22%,42 : 58,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929,8.4,27%,45 : 55,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,,21958,15.3,3%,48 : 52,2016
2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,,31268,28.7,2%,36 : 64,2016
2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,,4122,3.7,3%,,2016
2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,,10117,12.1,8%,28 : 72,2016


In [97]:
#Check the column to see if it is a float type column

df_times['num_students'].dtype

dtype('O')

In [98]:
#Convert the column to a float type column

df_times['num_students'] = df_times['num_students'].astype('float')

df_times['num_students'].dtype

dtype('float64')

#### Working with the 'international_students' column to change values from percentages to decimals

In [99]:
# For the international_students column, the majority of the values are percentages, which Python views as strings (hence the column having an object datatype)
# Need to convert these percentages to decimals to be consistent and make the column a float64 column in the end
# First, convert all the values in the column to strings

df_times['international_students'] = df_times['international_students'].astype('str')

In [100]:
df_times['international_students'].dtype

dtype('O')

In [101]:
# Build the cleaned values in a list: 'nan' and empty strings become NaN,
# every other entry has its '%' sign removed and is divided by 100 to give a decimal

results_int_stud = [
    np.nan if pct in ('nan', '') else float(pct.replace('%', '')) / 100
    for pct in df_times['international_students']
]

# Store the decimals in a temporary column next to the original one
df_times['int_stud_dec'] = results_int_stud

df_times

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year,int_stud_dec
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152.0,8.9,25%,,2011,0.25
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243.0,6.9,27%,33 : 67,2011,0.27
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074.0,9.0,33%,37 : 63,2011,0.33
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596.0,7.8,22%,42 : 58,2011,0.22
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929.0,8.4,27%,45 : 55,2011,0.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,,21958.0,15.3,3%,48 : 52,2016,0.03
2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,,31268.0,28.7,2%,36 : 64,2016,0.02
2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,,4122.0,3.7,3%,,2016,0.03
2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,,10117.0,12.1,8%,28 : 72,2016,0.08


In [102]:
# Replace the original column with the cleaned up values

df_times['international_students'] = df_times['int_stud_dec']

df_times

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year,int_stud_dec
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152.0,8.9,0.25,,2011,0.25
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243.0,6.9,0.27,33 : 67,2011,0.27
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074.0,9.0,0.33,37 : 63,2011,0.33
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596.0,7.8,0.22,42 : 58,2011,0.22
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929.0,8.4,0.27,45 : 55,2011,0.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,,21958.0,15.3,0.03,48 : 52,2016,0.03
2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,,31268.0,28.7,0.02,36 : 64,2016,0.02
2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,,4122.0,3.7,0.03,,2016,0.03
2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,,10117.0,12.1,0.08,28 : 72,2016,0.08


In [103]:
# Drop the newly created column

df_times = df_times.drop(columns = ['int_stud_dec'])

df_times

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152.0,8.9,0.25,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243.0,6.9,0.27,33 : 67,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074.0,9.0,0.33,37 : 63,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596.0,7.8,0.22,42 : 58,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929.0,8.4,0.27,45 : 55,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,,21958.0,15.3,0.03,48 : 52,2016
2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,,31268.0,28.7,0.02,36 : 64,2016
2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,,4122.0,3.7,0.03,,2016
2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,,10117.0,12.1,0.08,28 : 72,2016


In [104]:
#Check the column to see if it is a float type column

df_times['international_students'].dtype

dtype('float64')

In [105]:
df_times.dtypes

world_rank_Times                object
university_name                 object
country                         object
teaching_score                 float64
international_outlook_score    float64
research_score                 float64
citations_score                float64
income_score                   float64
overall_score_Times            float64
num_students                   float64
student_staff_ratio            float64
international_students         float64
female_male_ratio               object
year                             int64
dtype: object

#### Working with female_male_ratio column to change everything to decimals. 

In [106]:
# Upon inspection, some of the values in the 'female_male_ratio' column are strings (the ones with ':') but others are floats (the NaNs).
# Change all of them to strings before applying further changes (NaN becomes the string 'nan').

df_times['female_male_ratio'] = df_times['female_male_ratio'].astype('str')

In [107]:
df_times['female_male_ratio'].dtype

dtype('O')

In [108]:
# Remove all whitespace

df_times['female_male_ratio'] = df_times['female_male_ratio'].str.replace(" ", "")

df_times

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152.0,8.9,0.25,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243.0,6.9,0.27,33:67,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074.0,9.0,0.33,37:63,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596.0,7.8,0.22,42:58,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929.0,8.4,0.27,45:55,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,,21958.0,15.3,0.03,48:52,2016
2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,,31268.0,28.7,0.02,36:64,2016
2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,,4122.0,3.7,0.03,,2016
2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,,10117.0,12.1,0.08,28:72,2016


In [109]:
# Convert each cleaned 'female:male' string into a single value and collect the results in a list

def ratio_to_decimal(raw):
    """Translate one whitespace-free female_male_ratio string.

    Returns NaN for 'nan' or empty strings, the label "All Female" or "All Male" when the
    string contains '100', the female part divided by the male part for any other 'F:M'
    string, and the first five characters unchanged for a string without ':'.
    """
    if raw in ('nan', ''):
        return np.nan
    if ':' not in raw:
        return raw[:5]
    if '100' in raw:
        # '100' within the first three characters means it is the female side
        return "All Female" if '100' in raw[:3] else "All Male"
    parts = raw.split(':')
    return float(int(parts[0])) / float(int(parts[1]))

results_fm_ratio = [ratio_to_decimal(raw) for raw in df_times['female_male_ratio']]

# Store the converted values in a temporary column next to the original one
df_times['ratio_dec'] = results_fm_ratio

df_times

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year,ratio_dec
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152.0,8.9,0.25,,2011,
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243.0,6.9,0.27,33:67,2011,0.492537
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074.0,9.0,0.33,37:63,2011,0.587302
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596.0,7.8,0.22,42:58,2011,0.724138
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929.0,8.4,0.27,45:55,2011,0.818182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,,21958.0,15.3,0.03,48:52,2016,0.923077
2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,,31268.0,28.7,0.02,36:64,2016,0.5625
2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,,4122.0,3.7,0.03,,2016,
2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,,10117.0,12.1,0.08,28:72,2016,0.388889


In [110]:
# Replace the original column with the cleaned up values

df_times['female_male_ratio'] = df_times['ratio_dec']

df_times

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year,ratio_dec
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152.0,8.9,0.25,,2011,
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243.0,6.9,0.27,0.492537,2011,0.492537
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074.0,9.0,0.33,0.587302,2011,0.587302
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596.0,7.8,0.22,0.724138,2011,0.724138
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929.0,8.4,0.27,0.818182,2011,0.818182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,,21958.0,15.3,0.03,0.923077,2016,0.923077
2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,,31268.0,28.7,0.02,0.5625,2016,0.5625
2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,,4122.0,3.7,0.03,,2016,
2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,,10117.0,12.1,0.08,0.388889,2016,0.388889


In [111]:
# Drop the newly created column

df_times = df_times.drop(columns = ['ratio_dec'])

df_times

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year
0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152.0,8.9,0.25,,2011
1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243.0,6.9,0.27,0.492537,2011
2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074.0,9.0,0.33,0.587302,2011
3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596.0,7.8,0.22,0.724138,2011
4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,,94.2,7929.0,8.4,0.27,0.818182,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,,21958.0,15.3,0.03,0.923077,2016
2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,,31268.0,28.7,0.02,0.5625,2016
2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,,4122.0,3.7,0.03,,2016
2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,,10117.0,12.1,0.08,0.388889,2016


In [112]:
df_times.dtypes

world_rank_Times                object
university_name                 object
country                         object
teaching_score                 float64
international_outlook_score    float64
research_score                 float64
citations_score                float64
income_score                   float64
overall_score_Times            float64
num_students                   float64
student_staff_ratio            float64
international_students         float64
female_male_ratio               object
year                             int64
dtype: object

##### Note that even after formatting this column, it will still be a mixed data type column. This is because, even though most of the values are decimals, there are still some values that are strings ('All Female' or 'All Male' universities). This is acceptable for now, but it will have to be addressed before further analysis. 

In [113]:
# Final Check for mixed data type columns
# A column is printed when at least one of its values has a different Python type than the
# value in its first row. Series.map(type) does the element-wise type lookup because
# DataFrame.applymap is deprecated in pandas 2.1 and later.

for col in df_times.columns.tolist():
    value_types = df_times[col].map(type)
    if (value_types != value_types.iloc[0]).any():
        print(col)

female_male_ratio


As expected, the only column with mixed data types is the 'female_male_ratio' column. All other columns were addressed and formatted appropriately. 

### Addressing Missing Values

In [114]:
df_times.isnull().sum()

world_rank_Times                  0
university_name                   0
country                           0
teaching_score                    0
international_outlook_score       9
research_score                    0
citations_score                   0
income_score                    218
overall_score_Times            1402
num_students                     59
student_staff_ratio              59
international_students           67
female_male_ratio               236
year                              0
dtype: int64

There are missing values in the columns. While most of them can be ignored, it is important to note that the "overall_score_Times" column has 1402 missing values in 2603 rows. This is over 50% of the values, so this column may need to be dropped later on. After looking at the universities that have these missing values, all of them were ranked outside the top 200, so the missing data is probably because the rankings do not provide an overall score for universities outside the top 200. 

Rows that do not have an overall score may have to be deleted, but these universities still have other data that could be useful, so I will leave them for now. 

### Addressing Duplicate Values

In [115]:
# Collect every row that exactly repeats an earlier row (duplicated() compares all columns by default)
df_dups_1 = df_times[df_times.duplicated()]

In [116]:
df_dups_1

Unnamed: 0,world_rank_Times,university_name,country,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,female_male_ratio,year


There are no duplicate rows in this dataframe.

### Summary Statistics

In [117]:
df_times.describe()

Unnamed: 0,teaching_score,international_outlook_score,research_score,citations_score,income_score,overall_score_Times,num_students,student_staff_ratio,international_students,year
count,2603.0,2594.0,2603.0,2603.0,2385.0,1201.0,2544.0,2544.0,2536.0,2603.0
mean,37.801498,52.00744,35.910257,60.921629,48.979874,59.846128,23873.758648,18.445283,0.15444,2014.075682
std,17.604218,22.103825,21.254805,23.073219,21.179938,12.803446,17675.946877,11.458698,0.105915,1.685733
min,9.9,7.1,2.9,1.2,24.2,41.4,462.0,0.6,0.0,2011.0
25%,24.7,33.425,19.6,45.5,33.0,50.3,12637.75,11.975,0.08,2013.0
50%,33.9,50.3,30.5,62.5,41.0,56.0,20851.0,16.1,0.13,2014.0
75%,46.4,69.0,47.25,79.05,59.0,66.2,29991.0,21.5,0.21,2016.0
max,99.7,100.0,99.4,100.0,100.0,96.1,379231.0,162.6,0.82,2016.0


The counts for the columns are not all the same because of the missing values. Another thing to note is that the highest scores in the 'teaching_score' and 'research_score' columns were not equal to 100. 

### Exporting the cleaned file

In [118]:
# Write the cleaned dataframe to the 'Prepared Data' folder under the project path.
# NOTE(review): to_csv is called without index = False, so the row index is written as an extra
# unnamed first column (read back as 'Unnamed: 0' by default) - confirm downstream notebooks expect this.
df_times.to_csv(os.path.join(path, '02 Data','Prepared Data', 'times_cleaned_final.csv'))