In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the survey responses (adjust the path to wherever campus.csv lives)
# and discard the Timestamp column straight away.
df = pd.read_csv('campus.csv').drop(columns=['Timestamp'])
print(df.shape)
df.head()

(227, 15)


Unnamed: 0,What is your current year of study,What is your current relationship status?,How do you prefer studying before exams?,Do you do competitive programming?,Which hostel do you live in?,What is your gender?,What is your favourite sport?,Favourite hangout place in campus,Which state/union territory do you belong to?,What is your dept?,What is your preferred coding language?,What is your favourite place to eat at IIT Guwahati?,Which is your favourite club in IIT Guwahati?,Are you satisfied in IIT Guwahati,What is your current cpi?
0,2nd,In a relationship,Pulling of all nighters :),Yes,,Man,Table tennis,IIT G lake (in front of library),Haryana,CSE,C++,Rolls Mania/Tacos,Octaves,No,9.86
1,3rd,Single,Studying on a regular basis,No,Manas,Man,Football,IIT G lake (in front of library),Uttar Pradesh,CST,Python,Hostel Canteen,ml.ai,No,1.0
2,2nd,Single,Studying on a regular basis,No,Disang,Woman,Badminton,IIT G lake (in front of library),West Bengal,ECE,C,Hostel Mess,Robotics,Maybe,9.3
3,2nd,Single,Pulling of all nighters :),No,,Man,Football,IIT G lake (in front of library),The Government of NCT of Delhi,,Python,Food Court,Coding Club,Maybe,7.5
4,,Single,Studying on a regular basis,No,Lohit,Man,Cricket,IIT G lake (in front of library),Uttar Pradesh,,Java,Hostel Canteen,ml.ai,Yes,8.83


## Making DataFrame easier to work with and Cleaning data
Some columns were collected as short free-text answers, so they contain many unusable or inconsistent entries.

The CPI column used the short-text answer format.

In [3]:
# Map every long survey question to a short, code-friendly column name
short_names = {
    'What is your current year of study': 'year',
    'What is your current relationship status?': 'relationship',
    'How do you prefer studying before exams?': 'study',
    'Do you do competitive programming?': 'CP',
    'Which hostel do you live in?': 'hostel',
    'What is your gender?': 'gender',
    'What is your favourite sport?': 'sport',
    'Favourite hangout place in campus': 'hangout_spot',
    'Which state/union territory do you belong to?': 'state',
    'What is your dept?': 'dept',
    'What is your preferred coding language?': 'coding_lang',
    'What is your favourite place to eat at IIT Guwahati?': 'fav_place_to_eat',
    'Which is your favourite club in IIT Guwahati?': 'fav_club',
    'Are you satisfied in IIT Guwahati': 'satisfied',
    'What is your current cpi?': 'cpi',
}
df = df.rename(columns=short_names)

df.head()

Unnamed: 0,year,relationship,study,CP,hostel,gender,sport,hangout_spot,state,dept,coding_lang,fav_place_to_eat,fav_club,satisfied,cpi
0,2nd,In a relationship,Pulling of all nighters :),Yes,,Man,Table tennis,IIT G lake (in front of library),Haryana,CSE,C++,Rolls Mania/Tacos,Octaves,No,9.86
1,3rd,Single,Studying on a regular basis,No,Manas,Man,Football,IIT G lake (in front of library),Uttar Pradesh,CST,Python,Hostel Canteen,ml.ai,No,1.0
2,2nd,Single,Studying on a regular basis,No,Disang,Woman,Badminton,IIT G lake (in front of library),West Bengal,ECE,C,Hostel Mess,Robotics,Maybe,9.3
3,2nd,Single,Pulling of all nighters :),No,,Man,Football,IIT G lake (in front of library),The Government of NCT of Delhi,,Python,Food Court,Coding Club,Maybe,7.5
4,,Single,Studying on a regular basis,No,Lohit,Man,Cricket,IIT G lake (in front of library),Uttar Pradesh,,Java,Hostel Canteen,ml.ai,Yes,8.83


#### Simplifying/shortening some entries in data

In [4]:
# Shorten the labels used in the relationship column
print('Unique values in relationship column:')
print(df['relationship'].unique())
print()

# Assign the result back rather than calling replace(..., inplace=True) on
# df['relationship']: with pandas Copy-on-Write that chained in-place call
# only modifies a temporary Series and leaves df unchanged.
df['relationship'] = df['relationship'].replace({
    'In a relationship': 'relationship',
    "It's complicated": 'complicated',
})

print('Unique values in relationship column after replacements:')
print(df['relationship'].unique())

Unique values in relationship column:
['In a relationship' 'Single' "It's complicated" nan]

Unique values in relationship column after replacements:
['relationship' 'Single' 'complicated' nan]


In [5]:
# Shorten the labels used in the study column
print('Unique values in study column:')
print(df['study'].unique())
print()

# Assign the result back rather than calling replace(..., inplace=True) on
# df['study']: with pandas Copy-on-Write that chained in-place call only
# modifies a temporary Series and leaves df unchanged.
df['study'] = df['study'].replace({
    'Pulling of all nighters :)': 'all_nighters',
    'Studying on a regular basis': 'regular_study',
})

print('Unique values in study column after replacements:')
print(df['study'].unique())

Unique values in study column:
['Pulling of all nighters :)' 'Studying on a regular basis']

Unique values in study column after replacements:
['all_nighters' 'regular_study']


In [6]:
# Shorten the labels used in the hangout spot column
print('Unique values in hangout spot column:')
print(df['hangout_spot'].unique())
print()

# Assign the result back rather than calling replace(..., inplace=True) on
# df['hangout_spot']: with pandas Copy-on-Write that chained in-place call
# only modifies a temporary Series and leaves df unchanged.
df['hangout_spot'] = (
    df['hangout_spot']
    .replace({
        'IIT G lake (in front of library)': 'IITG_Lake',
        'Serpentine Lake 😏': 'serpentine;)',
        'Lake in front of the hospital': 'Tihor_lake',
    })
    # Remaining multi-word labels get underscores; the space is a literal, not a regex
    .str.replace(' ', '_', regex=False)
)

print('Unique values in hangout spot column after replacements:')
print(df['hangout_spot'].unique())

Unique values in hangout spot column:
['IIT G lake (in front of library)' 'Food Court' 'Hostel Canteen' 'Khoka'
 'Serpentine Lake 😏' nan 'New Sac' 'Old Sac' 'Playgrounds'
 'Lake in front of the hospital' 'Market Complex' 'Others']

Unique values in hangout spot column after replacements:
['IITG_Lake' 'Food_Court' 'Hostel_Canteen' 'Khoka' 'serpentine;)' nan
 'New_Sac' 'Old_Sac' 'Playgrounds' 'Tihor_lake' 'Market_Complex' 'Others']


In [7]:
# A few more one-off clean-ups: shorten, correct, or blank out odd entries.
# Each result is assigned back to its column; calling
# replace(..., inplace=True) on df[col] would only modify a temporary
# Series under pandas Copy-on-Write and leave df unchanged.
df['state'] = df['state'].replace('The Government of NCT of Delhi', 'Delhi')
df['fav_place_to_eat'] = df['fav_place_to_eat'].replace('None', np.nan)  # literal 'None' answers become missing
df['fav_club'] = df['fav_club'].replace('ml.ai', 'iitg.ai')
df['hostel'] = df['hostel'].replace('Bhramaputra', 'Brahmaputra')  # fix misspelling

In [8]:
# Convert cpi answers to float; answers that are not numeric become NaN
def isfloat(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Parameters
    ----------
    x : object
        Any value, typically a raw survey answer (string, number or NaN).

    Returns
    -------
    bool
        True when ``float(x)`` can be evaluated; this includes strings such
        as ``'nan'`` and ``'inf'``. False when the conversion raises
        TypeError, ValueError or OverflowError.
    """
    try:
        float(x)
    # Only conversion failures are caught, so KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by a bare ``except``.
    except (TypeError, ValueError, OverflowError):
        return False
    return True
    
# Parse every answer with float(); anything isfloat() rejects becomes NaN.
# Done column-wise instead of row-by-row with df.loc, so the result is a
# proper float64 column rather than an object column holding floats.
cpi_values = (
    df['cpi']
    .map(lambda answer: float(answer) if isfloat(answer) else np.nan)
    .astype('float64')
)

# Only CPIs strictly between 4 and 10 are kept; everything else, including
# exactly 4 and exactly 10, is treated as a fake answer and set to NaN.
# NaN fails both comparisons, so unparseable answers stay NaN.
df['cpi'] = cpi_values.where((cpi_values > 4) & (cpi_values < 10))

In [9]:
# Make sure cpi is stored as float64, then count the missing values
df = df.astype({'cpi': 'float64'})
df['cpi'].isnull().sum()

21