In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Read data_cleaned.csv into a DataFrame, report its dimensions,
# and preview the first rows as the cell's rich output.
survey_data = pd.read_csv(filepath_or_buffer = 'data_cleaned.csv')
print(survey_data.shape)

survey_data.head(n = 5)

(792, 14)


Unnamed: 0,gender,mother_tongue,mother_tongue_rating,no_of_fluent_languages,educational_qualification,state,resided_in_the_same_city_from_birth,teaching_language_preference,notes_language_preference,questionpaper_language_preference,learning_in_local_language,exam_in_regional_language,teaching_in_regional_language,knowledge_improvement
0,Female,Tamil,Excellent,5.0,Undergraduate,Karnataka,Yes,Regional Language,Regional Language,Regional Language,Yes,Yes,Yes,Yes
1,Male,Telugu,Excellent,2.0,Undergraduate,Andhra pradesh,Yes,English Language,English Language,English Language,Maybe,No,Yes,Yes
2,Female,Hindi,Good,1.0,Undergraduate,Uttar pradesh,No,Hybrid Language,English Language,Hybrid Language,Maybe,No,Maybe,No
3,Male,Telugu,Excellent,3.0,Undergraduate,Telangana,Yes,Hybrid Language,English Language,English Language,Yes,No,Yes,No
4,Male,Kannada,Excellent,3.0,After undergraduation,Karnataka,Yes,Hybrid Language,English Language,English Language,Yes,Yes,Yes,Yes


In [3]:
# Columns that contain at least one missing value.
null_features = [column for column in survey_data.columns if survey_data[column].isnull().any()]

# Share of missing rows (in percent) for each affected column, largest first.
(
    survey_data[null_features]
    .isnull()
    .sum()
    .sort_values(ascending = False)
    * 100
    / len(survey_data)
)

learning_in_local_language           6.439394
no_of_fluent_languages               0.378788
notes_language_preference            0.252525
gender                               0.126263
teaching_language_preference         0.126263
questionpaper_language_preference    0.126263
teaching_in_regional_language        0.126263
dtype: float64

# The Variable Types

In [4]:
# Any column whose dtype is not the generic object dtype ('O') counts as numerical.
numerical_features = [column for column, column_dtype in survey_data.dtypes.items() if column_dtype != 'O']
print("Total number of numerical features: ", len(numerical_features))
print(numerical_features)

Total number of numerical features:  1
['no_of_fluent_languages']


In [5]:
# Categorical predictors: every column that is neither numerical nor the
# target column 'knowledge_improvement'.
categorical_features = [feature for feature in survey_data.columns if feature not in numerical_features and feature != 'knowledge_improvement']
print("Total number of categorical features: ", len(categorical_features))
print(categorical_features)

Total number of numerical features:  12
['gender', 'mother_tongue', 'mother_tongue_rating', 'educational_qualification', 'state', 'resided_in_the_same_city_from_birth', 'teaching_language_preference', 'notes_language_preference', 'questionpaper_language_preference', 'learning_in_local_language', 'exam_in_regional_language', 'teaching_in_regional_language']


# Splitting train and test data

In [6]:
# 1. Removing null rows: discard every row that has a missing value in any column
survey_data = survey_data.dropna(axis = 0, how = 'any')

In [7]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
# Features are every column except the target 'knowledge_improvement'.
X_train, X_test, y_train, y_test = train_test_split(
    survey_data.drop(columns = ['knowledge_improvement']),
    survey_data['knowledge_improvement'],
    test_size = 0.3,
    random_state = 12,
)

# Handling Missing Values

# Categorical Encoding

In [8]:
# Inspect the frame again now that rows with missing values have been dropped.
print(survey_data.shape)
survey_data.head(n = 5)

(734, 14)


Unnamed: 0,gender,mother_tongue,mother_tongue_rating,no_of_fluent_languages,educational_qualification,state,resided_in_the_same_city_from_birth,teaching_language_preference,notes_language_preference,questionpaper_language_preference,learning_in_local_language,exam_in_regional_language,teaching_in_regional_language,knowledge_improvement
0,Female,Tamil,Excellent,5.0,Undergraduate,Karnataka,Yes,Regional Language,Regional Language,Regional Language,Yes,Yes,Yes,Yes
1,Male,Telugu,Excellent,2.0,Undergraduate,Andhra pradesh,Yes,English Language,English Language,English Language,Maybe,No,Yes,Yes
2,Female,Hindi,Good,1.0,Undergraduate,Uttar pradesh,No,Hybrid Language,English Language,Hybrid Language,Maybe,No,Maybe,No
3,Male,Telugu,Excellent,3.0,Undergraduate,Telangana,Yes,Hybrid Language,English Language,English Language,Yes,No,Yes,No
4,Male,Kannada,Excellent,3.0,After undergraduation,Karnataka,Yes,Hybrid Language,English Language,English Language,Yes,Yes,Yes,Yes


## Top 4 mother tongue languages

In [9]:
# One-hot encode 'mother_tongue', keeping indicator columns only for the
# 4 top categories. The encoder is fitted on the training split and then
# applied to both splits.
mother_tongue_encoder = OneHotEncoder(variables = ['mother_tongue'], top_categories = 4)
mother_tongue_encoder.fit(X_train)

X_train, X_test = map(mother_tongue_encoder.transform, (X_train, X_test))

In [10]:
# 'mother_tongue' is already encoded by its own encoder, so leave it out here.
# Filter into a new list instead of calling list.remove(): when the notebook
# is re-run from the train/test split onward, 'mother_tongue' is already gone
# from the list and .remove() would raise ValueError.
categorical_features = [feature for feature in categorical_features if feature != 'mother_tongue']

# One-hot encode the remaining categorical columns; fit on the training split
# only, then apply the fitted encoder to both splits.
encoder = OneHotEncoder(variables = categorical_features)
encoder.fit(X_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [11]:
# Preview the training features after one-hot encoding.
X_train.head(n = 5)

Unnamed: 0,no_of_fluent_languages,mother_tongue_Telugu,mother_tongue_Kannada,mother_tongue_Tamil,mother_tongue_Hindi,gender_Male,gender_Female,gender_Prefer not to say,mother_tongue_rating_Excellent,mother_tongue_rating_Very Good,...,questionpaper_language_preference_Regional Language,learning_in_local_language_Yes,learning_in_local_language_No,learning_in_local_language_Maybe,exam_in_regional_language_No,exam_in_regional_language_Yes,exam_in_regional_language_Maybe,teaching_in_regional_language_No,teaching_in_regional_language_Yes,teaching_in_regional_language_Maybe
630,4.0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
545,4.0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0
735,4.0,0,1,0,0,1,0,0,0,1,...,0,1,0,0,0,0,1,0,0,1
459,5.0,0,1,0,0,1,0,0,1,0,...,1,1,0,0,0,1,0,0,1,0
522,2.0,1,0,0,0,1,0,0,1,0,...,0,0,0,1,1,0,0,1,0,0


# Feature Scaling

In [12]:
# Standardise the numerical columns. The scaler is fitted on the training
# split only and the same fitted scaler is applied to both splits.
scaler = StandardScaler().fit(X_train[numerical_features])

X_train[numerical_features] = scaler.transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [13]:
# Preview the training features after scaling the numerical columns.
X_train.head(n = 5)

Unnamed: 0,no_of_fluent_languages,mother_tongue_Telugu,mother_tongue_Kannada,mother_tongue_Tamil,mother_tongue_Hindi,gender_Male,gender_Female,gender_Prefer not to say,mother_tongue_rating_Excellent,mother_tongue_rating_Very Good,...,questionpaper_language_preference_Regional Language,learning_in_local_language_Yes,learning_in_local_language_No,learning_in_local_language_Maybe,exam_in_regional_language_No,exam_in_regional_language_Yes,exam_in_regional_language_Maybe,teaching_in_regional_language_No,teaching_in_regional_language_Yes,teaching_in_regional_language_Maybe
630,0.79744,0,0,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
545,0.79744,0,1,0,0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0
735,0.79744,0,1,0,0,1,0,0,0,1,...,0,1,0,0,0,0,1,0,0,1
459,1.680996,0,1,0,0,1,0,0,1,0,...,1,1,0,0,0,1,0,0,1,0
522,-0.969673,1,0,0,0,1,0,0,1,0,...,0,0,0,1,1,0,0,1,0,0


# Saving our data

In [14]:
# Write the processed feature splits to CSV; index = False leaves the row index out of the files.
X_train.to_csv(path_or_buf = "X_train.csv", index = False)
X_test.to_csv(path_or_buf = "X_test.csv", index = False)

In [15]:
def encode_target(value):
    """Translate a target answer into its integer class code.

    Parameters
    ----------
    value : str
        One of 'Yes', 'No' or 'Maybe'.

    Returns
    -------
    int
        1 for 'Yes', 0 for 'No', 2 for 'Maybe'.

    Raises
    ------
    KeyError
        If ``value`` is not one of the three known answers.
    """
    codes_by_answer = {'No': 0, 'Yes': 1, 'Maybe': 2}
    return codes_by_answer[value]

In [16]:
# Replace the 'Yes' / 'No' / 'Maybe' labels of both target splits with their integer codes.
y_train = y_train.apply(func = encode_target)
y_test = y_test.apply(func = encode_target)

In [17]:
# Write the encoded target splits to CSV; index = False leaves the row index out of the files.
y_train.to_csv(path_or_buf = "y_train.csv", index = False)
y_test.to_csv(path_or_buf = "y_test.csv", index = False)