In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



In [2]:
date_columns = ['Internship_deadline', 'Earliest_Start_Date', 'Start Date', 
                'End Date', 'Start_Date']

train_merged = pd.read_csv('../data/train_merged.csv')
test_merged = pd.read_csv('../data/test_merged.csv')

In [None]:
data_dictionary = pd.read_excel('../data/Data_Dictionary.xlsx'); data_dictionary.head()

## Training dataset

In [None]:
train_merged.head()

## Test set

In [None]:
test_merged.head()

## Class Distribution

In [None]:
train_merged.Is_Shortlisted.value_counts()

## Exploratory Data Analysis

In [None]:
train_merged.loc[:, 'Earliest_Start_Date_year'] = train_merged.Earliest_Start_Date.dt.year
train_merged.loc[:, 'Earliest_Start_Date_month'] = train_merged.Earliest_Start_Date.dt.month
train_merged.loc[:, 'Earliest_Start_Date_day'] = train_merged.Earliest_Start_Date.dt.day

test_merged.loc[:, 'Earliest_Start_Date_year'] = test_merged.Earliest_Start_Date.dt.year
test_merged.loc[:, 'Earliest_Start_Date_month'] = test_merged.Earliest_Start_Date.dt.month
test_merged.loc[:, 'Earliest_Start_Date_day'] = test_merged.Earliest_Start_Date.dt.day

train_merged.loc[:, 'Internship_deadline_year'] = train_merged.Internship_deadline.dt.year
train_merged.loc[:, 'Internship_deadline_month'] = train_merged.Internship_deadline.dt.month
train_merged.loc[:, 'Internship_deadline_day'] = train_merged.Internship_deadline.dt.day

test_merged.loc[:, 'Internship_deadline_year'] = test_merged.Internship_deadline.dt.year
test_merged.loc[:, 'Internship_deadline_year'] = test_merged.Internship_deadline.dt.month
test_merged.loc[:, 'Internship_deadline_year'] = test_merged.Internship_deadline.dt.day

In [None]:
train_merged.loc[:, 'date_diff'] = ((train_merged.Internship_deadline - train_merged.Earliest_Start_Date) / np.timedelta64(1, 'D')).astype(int)

In [None]:
# Relationship between difference between internship deadline vs whether you are shorlisted or not

sns.FacetGrid(train_merged, hue="Is_Shortlisted", size=5) \
   .map(plt.hist, "date_diff") \
   .add_legend()

In [None]:
## Relationship between expected stipend and Is_Shortlisted

train_merged.loc[:, ['Stipend_Type', 'Is_Shortlisted']].head()

In [None]:
train_merged.Expected_Stipend.value_counts()

In [None]:
def salary_mapping(salary):
    if salary < 2000:
        return 'No Expectations'
    elif salary >= 2000 and salary < 5000:
        return '2-5K'
    elif salary >= 5000 and salary < 10000:
        return '5-10K'
    else:
        return '10K+'

In [44]:
train_merged.Stipend1 = train_merged.Stipend1.fillna(train_merged.Stipend1.mean())

In [None]:
train_merged.loc[:, 'Stipend_level'] = train_merged.Stipend1.map(salary_mapping)

In [None]:
def check_if_expectations_match(row):
    expected_stipend = row['Expected_Stipend']
    stipend_level = row['Stipend_level']
    
    if expected_stipend == 'No Expectations':
        return 1
    elif expected_stipend == '2-5K':
        if stipend_level in ['2-5K', '5-10K', '10K+']:
            return 1
        else:
            return 0
    elif expected_stipend == '5-10K':
        if stipend_level in ['5-10K', '10K+']:
            return 1
        else:
            return 0
    elif stipend_level == '10K+':
        if stipend_level == '10K+':
            return 1
        else:
            return 0

train_merged.loc[:, 'expectations_match'] = train_merged[['Expected_Stipend', 'Stipend_level']].apply(check_if_expectations_match, axis=1)

In [None]:
train_merged.groupby(['expectations_match', 'Is_Shortlisted']).size()

** This is a potential feature **

In [None]:
train_merged.loc[:, 'normalized_stipend_1'] = np.log10(train_merged.Stipend1 + 1)

In [None]:
sns.FacetGrid(train_merged, hue="Is_Shortlisted", size=5) \
   .map(plt.hist, "normalized_stipend_1") \
   .add_legend()

In [None]:
def check_if_locations_match(row):
    internship_location = row['Internship_Location']
    student_location = row['Location']
    
    return int(internship_location == student_location)

In [None]:
train_merged.loc[:, 'location_match'] = train_merged[['Internship_Location', 'Location']].apply(check_if_locations_match, axis=1)

In [None]:
train_merged.groupby(['location_match', 'Is_Shortlisted']).size()

In [None]:
train_merged.loc[:, ['Internship_Location', 'Is_Shortlisted']].head()

In [None]:
print 'Number of levels of Internship location ', len(train_merged.Internship_Location.unique())

In [None]:
len(train_merged.Location.unique())

In [None]:
train_merged.Internship_Location.value_counts()

In [None]:
train_merged.groupby(['Internship_Location', 'Is_Shortlisted']).size()

In [None]:
train_merged.Skills_required.unique()

In [26]:
train_merged.select_dtypes(include=['object']).columns

Index([u'Earliest_Start_Date', u'Expected_Stipend', u'Preferred_location',
       u'Internship_Profile', u'Skills_required', u'Internship_Type',
       u'Internship_Location', u'Internship_category', u'Stipend_Type',
       u'Internship_deadline', u'Start_Date', u'Institute_Category',
       u'Institute_location', u'hometown', u'Degree', u'Stream',
       u'Current_year', u'Experience_Type', u'Profile', u'Location',
       u'Start Date', u'End Date'],
      dtype='object')

## Pivot Tables

In [52]:
features = ['Expected_Stipend', 'Stipend_Type', 'Is_Shortlisted', 'Internship_Location', 'Experience_Type',
            'Institute_Category', 'Stipend1', 'Degree']

In [48]:

train_merged[features].pivot_table(index=['Expected_Stipend', 'Stipend_Type'],
                                   columns=['Experience_Type'],
                                   values=['Is_Shortlisted'],
                                   aggfunc='count', margins=True, fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Is_Shortlisted,Is_Shortlisted,Is_Shortlisted,Is_Shortlisted,Is_Shortlisted,Is_Shortlisted,Is_Shortlisted,Is_Shortlisted,Is_Shortlisted,Is_Shortlisted
Unnamed: 0_level_1,Experience_Type,academic_project,award,internship,job,other,participation,por,training,workshop,All
Expected_Stipend,Stipend_Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
10K+,fixed,3251,400,3284,1822,226,238,423,2075,332,17120
10K+,performance,97,35,265,143,18,43,50,150,26,1206
10K+,unpaid,7,5,13,8,2,0,0,1,2,65
10K+,variable,3466,469,4073,2389,222,289,505,2366,464,20006
2-5K,fixed,2483,627,3736,969,278,688,894,1950,734,18140
2-5K,performance,312,184,1110,261,114,199,284,379,192,4437
2-5K,unpaid,27,8,49,12,5,17,12,17,7,243
2-5K,variable,2485,764,3938,1007,308,794,1057,1981,806,19185
5-10K,fixed,4215,685,5183,2082,324,586,868,2862,784,25993
5-10K,performance,239,80,709,225,46,76,123,274,100,2683


In [53]:
train_merged[features].pivot_table(index=['Is_Shortlisted'],
                                   columns=['Stipend_Type', 'Institute_Category', 'Degree'],
                                   values=['Stipend1'],
                                   aggfunc='mean', margins=True, fill_value=0)

Unnamed: 0_level_0,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1,Stipend1
Stipend_Type,fixed,fixed,fixed,fixed,fixed,fixed,fixed,fixed,fixed,fixed,...,variable,variable,variable,variable,variable,variable,variable,variable,variable,All
Institute_Category,N,N,N,N,N,N,N,N,N,N,...,Y,Y,Y,Y,Y,Y,Y,Y,Y,Unnamed: 21_level_2
Degree,10th,12th,A Levels,AAIP,AISSE,AMIETE(Equivalent To B.Tech),APEARING B.TECH,Aa,Acca,Ace Engineering College,...,Post Graduation,UG,UGD,Under Graduate,Under Graduate First Year,Under Graduation,Undergrad,Undergraduate,Undergraguation,Unnamed: 21_level_3
Is_Shortlisted,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
0,7500,6850.931677,7333.333333,10000,15000,5000,6500,3000,7000.0,7000,...,3500,3727.272727,1000,6067.567568,1500,5500,0,4504.0,5000,6966.626964
1,0,5615.384615,0.0,0,0,0,0,0,3000.0,0,...,0,2000.0,0,2333.333333,1250,0,5000,3166.666667,0,5660.599743
All,7500,6643.410853,7333.333333,10000,15000,5000,6500,3000,5666.666667,7000,...,3500,3583.333333,1000,5787.5,1375,5500,5000,4245.16129,5000,6799.940342


In [29]:
train_merged.groupby(['Experience_Type', 'Is_Shortlisted']).size()

Experience_Type   Is_Shortlisted
academic_project  0                 25567
                  1                  3567
award             0                  5177
                  1                   856
internship        0                 29928
                  1                  5758
job               0                 12215
                  1                  1810
other             0                  2078
                  1                   409
participation     0                  4647
                  1                   766
por               0                  6166
                  1                  1241
training          0                 18830
                  1                  2339
workshop          0                  6220
                  1                   806
dtype: int64