Importing pandas

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

Loading the clean Student Performance DataFile

In [2]:
df=pd.read_csv('../clean_data.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'gender', 'race/ethnicity', 'parental level of education',
       'lunch', 'test preparation course', 'math score', 'reading score',
       'writing score', 'gender_map', 'race_ethnicity_map',
       'parental level of education_map', 'lunch_map',
       'test preparation course_map', 'total score', 'percentage', 'grades',
       'grades_map'],
      dtype='object')

# Predicting Gender

In [4]:
x = df[['math score','reading score','writing score','gender_map','race/ethinicity_map','parental_level_of_education_map','test_preparation_course_map','lunch_map','Result_map', 'grades_map']]

KeyError: "['parental_level_of_education_map', 'race/ethinicity_map', 'Result_map', 'test_preparation_course_map'] not in index"

# Identifying and handling Outliers with Z-score

In [None]:
# With Z-Score
from scipy import stats
z = np.abs(stats.zscore(x))
print(z)

In [None]:
print(np.where(z > 3))

In [None]:
df_outliers = x[(z < 3).all(axis=1)]


In [None]:
plt.figure(1)
x.plot.box()
plt.title("With outliers")
plt.xticks(rotation=90)
plt.figure(1)
df_outliers.plot.box()
plt.title("Without outliers")
plt.xticks(rotation=90)

# Training and testing after handling outliers

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
df_outliers_cleared = df_outliers[['math score','reading score','writing score','grades_map','race/ethinicity_map','parental_level_of_education_map','test_preparation_course_map','lunch_map','Result_map']]
robust_scaler = RobustScaler()
X = robust_scaler.fit_transform(df_outliers_cleared)
y = df_outliers['gender_map']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=40)

# Using Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(df_outliers_cleared, y)

In [None]:
clf.score(X_train,y_train)

In [None]:
clf.predict(X_test)

# checking the important features

In [None]:
feature=['grades_map',
       'race/ethinicity_map', 'parental_level_of_education_map', 'lunch_map',
       'test_preparation_course_map','math score','reading score','writing score','Result_map' ]

In [None]:
feature_imp=pd.DataFrame({'feature':feature,'importance':clf.feature_importances_})
feature_imp

In [None]:
feature_imp.plot.bar()

# Removing less imp features from the feature

In [None]:
df_new = df_outliers_cleared.drop(['reading score','math score','writing score','Result_map'], axis=1)

# Training and Testing after removing less imp features

In [None]:
robust_scaler2 = RobustScaler()
X = robust_scaler2.fit_transform(df_new)
y = df_outliers['gender_map']
X_train2,X_test2,y_train2,y_test2 = train_test_split(X,y,test_size=0.2,random_state=100)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression(n_jobs=-1, random_state=10)

In [None]:
logistic_regression.fit(X_train2,y_train2)

In [None]:
prediction = logistic_regression.predict(X_test2)

In [None]:
from sklearn.metrics import precision_score,recall_score, confusion_matrix,accuracy_score, f1_score

In [None]:
print ('Accuracy:', accuracy_score(y_test2,prediction ))
print ('F1 score:', f1_score(y_test2, prediction,average='micro'))
print ('Precision:', precision_score(y_test2, prediction,average='micro'))
print ('Recall:', recall_score(y_test2, prediction,average='micro'))
print ('\n confussion matrix:\n',confusion_matrix(y_test2, prediction))

# Classification Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(min_samples_split=10,min_samples_leaf=10,random_state=10)
clf.fit(X_train2,y_train2)

In [None]:
prediction = clf.predict(X_test2)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [None]:
print ('Accuracy:', accuracy_score(y_test2,prediction ))

In [None]:
print ('\n confussion matrix:\n',confusion_matrix(y_test2, prediction))

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()

In [None]:
model.fit(X_train2,y_train2)

In [None]:
model.score(X_train2,y_train2)

In [None]:
model.score(X_test2,y_test2)

In [None]:
prediction=model.predict(X_test2)
prediction

In [None]:
plt.scatter(y_test2, prediction)
plt.xlabel("True Values")
plt.ylabel("Predictions")

# KNN

In [None]:
# Training data
x3 = df[['grades_map',
       'race/ethinicity_map', 'parental_level_of_education_map',
       'test_preparation_course_map','reading score','writing score']]
# target values 
y3 = df['gender_map']

X_train3, X_test3, y_train3, y_test3 = train_test_split(x3, y3, test_size=0.20, random_state=40)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train3, y_train3) 

In [None]:
neigh.score(X_test3,y_test3)

In [None]:
predict = neigh.predict(X_test3)
predict

In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test3, predict)
accuracy

In [None]:
from sklearn import metrics
print(metrics.classification_report(y_test3,predict))