In [1]:
import pandas as pd
import numpy as np
from ipywidgets import interact
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
# Load the raw applicant data; the path is relative to the notebook's working directory.
df = pd.read_csv("Data_Science_2020_v2.csv")

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['Application_ID', 'Current City', 'Python (out of 3)',
       'R Programming (out of 3)', 'Data Science (out of 3)', 'Other skills',
       'Institute', 'Degree', 'Stream', 'Current Year Of Graduation',
       'Performance_PG', 'Performance_UG', 'Performance_12', 'Performance_10'],
      dtype='object')

In [4]:
# Skills that earn bonus points. Every name appears twice (with and without a
# leading space) because the raw "Other skills" field is split on "," without
# stripping, so all items after the first keep their leading space.
oskills = {
    'Machine Learning', ' Machine Learning',
    'Deep Learning', ' Deep Learning',
    'Natural Language Processing (NLP)', ' Natural Language Processing (NLP)',
    'Amazon Web Services (AWS)', ' Amazon Web Services (AWS)',
    'Statistical Modeling', ' Statistical Modeling',
    'MS-Excel', ' MS-Excel',
}
# Points by graduation year; years missing from the mapping get 5 (see loop).
ugyear = {2020: 10, 2019: 8}
# Kept for the PG-specific scoring, which is currently disabled: every
# applicant is scored with `ugyear`, regardless of Performance_PG.
pgyear = {2020: 7}
# Points for a self-rating (1-3); any other value earns 0.
rating = {3: 10, 2: 7, 1: 3}
# Minimum total score required for Approval == 'Yes'.
APPROVAL_THRESHOLD = 40

totalscore = []
skillsown = []
for py, r, ds, osk, year, per in zip(
    df['Python (out of 3)'],
    df['R Programming (out of 3)'],
    df['Data Science (out of 3)'],
    df['Other skills'],
    df['Current Year Of Graduation'],
    df['Performance_PG'],
):
    score = ugyear.get(year, 5)
    score += rating.get(py, 0)
    score += rating.get(r, 0)
    score += rating.get(ds, 0)

    # A missing "Other skills" cell is not a string (pandas reads it as NaN).
    # Test the type explicitly instead of catching every exception, and fall
    # back to an empty list so no skill can match.
    ownskill = osk.split(",") if isinstance(osk, str) else []
    skillscore = len(oskills.intersection(ownskill))

    # Any mention of SQL or a database counts as one additional skill;
    # str() keeps the substring test safe for non-string cells.
    if "SQL" in str(osk) or "DB" in str(osk):
        skillscore += 1

    # Each recognised skill is worth 3 points.
    score += skillscore * 3

    totalscore.append(score)
    skillsown.append(skillscore)

df['skillsown'] = [count * 3 for count in skillsown]
df['totalscore'] = totalscore
df['Approval'] = ['Yes' if total >= APPROVAL_THRESHOLD else 'No' for total in totalscore]

In [5]:
# Keep the first 20 rows as a small sample (df1 is not used again in the cells shown here).
df1 = df.head(20)

In [6]:
# Count how many applicants fall into each Approval class.
df.Approval.value_counts()

No     567
Yes     44
Name: Approval, dtype: int64

## So out of all participants only 44 are approved for the next round 

In [7]:
# Persist the scored applicants. index=False keeps the row index out of the
# file, so reading it back does not produce a spurious "Unnamed: 0" column.
df.to_csv('final.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'final.csv'

# Model Building

In [8]:
# Read the scored data back from final.csv (the file the previous cell writes).
data = pd.read_csv('final.csv')

In [9]:
# Modelling subset: the three self-ratings, graduation year, the skill score
# and the Approval label (kept as the last column).
df_s = data.loc[:, [
    'Python (out of 3)',
    'R Programming (out of 3)',
    'Data Science (out of 3)',
    'Current Year Of Graduation',
    'skillsown',
    'Approval',
]]

In [10]:
# Preview the first five rows of the modelling subset.
df_s.head()

Unnamed: 0,Python (out of 3),R Programming (out of 3),Data Science (out of 3),Current Year Of Graduation,skillsown,Approval
0,1,0,3,2019,6,No
1,2,1,2,2020,9,No
2,2,0,0,2018,0,No
3,2,0,2,2021,12,No
4,2,0,0,2018,6,No


In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

## Data Pre processing

In [12]:
# Column labels of the modelling subset; the trailing [:] is a full slice and changes nothing.
df_s.columns[:]

Index(['Python (out of 3)', 'R Programming (out of 3)',
       'Data Science (out of 3)', 'Current Year Of Graduation', 'skillsown',
       'Approval'],
      dtype='object')

In [13]:
# One-hot encode every column except the last one (the Approval label).
# Naming the columns explicitly makes pandas treat these numeric columns as
# categories, giving one indicator column per distinct value.
df_dummy = pd.get_dummies(df_s, columns=df_s.columns[:-1])

In [14]:
# Set the label column from df_s. NOTE(review): Approval was not in the
# `columns=` list passed to get_dummies, so it is presumably already carried
# through unchanged and this assignment looks redundant — confirm before removing.
df_dummy['Approval'] = df_s.Approval

In [15]:
# Preview the encoded frame.
df_dummy.head()

Unnamed: 0,Approval,Python (out of 3)_0,Python (out of 3)_1,Python (out of 3)_2,Python (out of 3)_3,R Programming (out of 3)_0,R Programming (out of 3)_1,R Programming (out of 3)_2,R Programming (out of 3)_3,Data Science (out of 3)_0,...,Current Year Of Graduation_2021,Current Year Of Graduation_2022,Current Year Of Graduation_2023,skillsown_0,skillsown_3,skillsown_6,skillsown_9,skillsown_12,skillsown_15,skillsown_18
0,No,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,No,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,No,0,0,1,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
3,No,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,No,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [16]:
# Class distribution before balancing.
df_dummy.Approval.value_counts()

No     567
Yes     44
Name: Approval, dtype: int64

### Balancing the data

In [17]:
# Split the encoded frame by class.
df_majority = df_dummy[df_dummy.Approval == 'No']
df_minority = df_dummy[df_dummy.Approval == 'Yes']

# Upsample the minority class with replacement until it matches the majority
# class. The target size is derived from the data instead of being hard-coded,
# so the cell stays correct if the class counts change.
# NOTE(review): this runs before the train/test split, so copies of the same
# minority row can land in both the training and the test portion.
df_minority_upsampled = resample(
    df_minority,
    replace=True,                  # sample with replacement
    n_samples=len(df_majority),    # to match majority class
    random_state=101,              # reproducible results
)
df_balance = pd.concat([df_majority, df_minority_upsampled])

In [18]:
# Class distribution after upsampling the minority class.
df_balance.Approval.value_counts()

Yes    567
No     567
Name: Approval, dtype: int64

In [19]:
# Preview the balanced frame instead of rendering all of its rows.
df_balance.head()

Unnamed: 0,Approval,Python (out of 3)_0,Python (out of 3)_1,Python (out of 3)_2,Python (out of 3)_3,R Programming (out of 3)_0,R Programming (out of 3)_1,R Programming (out of 3)_2,R Programming (out of 3)_3,Data Science (out of 3)_0,...,Current Year Of Graduation_2021,Current Year Of Graduation_2022,Current Year Of Graduation_2023,skillsown_0,skillsown_3,skillsown_6,skillsown_9,skillsown_12,skillsown_15,skillsown_18
0,No,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,No,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,No,0,0,1,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
3,No,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,No,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [20]:
# Save the balanced, one-hot encoded frame. The row index is written as the
# first CSV column (to_csv default); because the minority rows were sampled
# with replacement and concatenated as-is, that index contains repeated labels.
df_balance.to_csv('final_dummify_balanced_data.csv')

## Modeling Part

In [21]:
# 70/30 split of the balanced data. The features are selected by dropping the
# label by name rather than by position (`columns[1:]`), so the split stays
# correct even if Approval is not the first column.
xtr,xte,ytr,yte = train_test_split(
    df_balance.drop(columns='Approval'),
    df_balance['Approval'],
    test_size=0.3,
    random_state=101,
)

In [22]:
# 70/30 split of the original (unbalanced) data. The features are selected by
# dropping the label by name rather than by position (`columns[1:]`).
# NOTE(review): df_balance keeps every 'No' row of df_dummy and is split
# independently of this call, so rows in xte_o may also have been used to fit
# models trained on xtr — treat "original data" test scores as optimistic.
xtr_o,xte_o,ytr_o,yte_o = train_test_split(
    df_dummy.drop(columns='Approval'),
    df_dummy['Approval'],
    test_size=0.3,
    random_state=101,
)

In [23]:
# (rows, features) of the train and test portions of the original data.
xtr_o.shape,xte_o.shape

((427, 38), (184, 38))

In [24]:
# (rows, features) of the train and test portions of the balanced data.
xtr.shape,xte.shape

((793, 38), (341, 38))

### Decision Tree

In [25]:
def treebuild(cri,mxd,minsl,rs,spl):
    """Fit a decision tree on the balanced training split and print its scores.

    Reads the notebook globals xtr/ytr/xte/yte (balanced split) and
    xtr_o/ytr_o/xte_o/yte_o (original split).

    Parameters
    ----------
    cri : split criterion passed to DecisionTreeClassifier (``criterion``).
    mxd : maximum tree depth (``max_depth``).
    minsl : minimum samples per leaf (``min_samples_leaf``).
    rs : random seed (``random_state``).
    spl : splitter strategy (``splitter``).

    Returns
    -------
    None; accuracies and a classification report are printed.
    """
    Approve_Tree = DecisionTreeClassifier(
        criterion=cri,
        max_depth=mxd,
        min_samples_leaf=minsl,
        random_state=rs,
        splitter=spl,
    )
    Approve_Tree.fit(xtr, ytr)

    test_pred_balanced = Approve_Tree.predict(xte)
    test_pred_original = Approve_Tree.predict(xte_o)
    train_pred_balanced = Approve_Tree.predict(xtr)
    train_pred_original = Approve_Tree.predict(xtr_o)

    # Sizes of the balanced test labels and of the original-test predictions.
    print(len(yte), len(test_pred_original))

    # Accuracy is the share of rows whose prediction equals the true label.
    accuracy_rows = (
        ("Test Accuracy original Data", yte_o, test_pred_original),
        ("Train Accuracy Original Data", ytr_o, train_pred_original),
        ("Test Accuracy Balanced Data", yte, test_pred_balanced),
        ("Train Accuracy Balanced Data", ytr, train_pred_balanced),
    )
    for label, truth, predicted in accuracy_rows:
        print(label, np.mean(truth == predicted))

    print(classification_report(yte_o, test_pred_original))


# One dropdown per hyper-parameter; changing any of them re-runs treebuild.
interact(
    treebuild,
    cri=['entropy', 'gini'],
    mxd=list(range(1, 20)),
    minsl=list(range(1, 10)),
    rs=list(range(30)),
    spl=['best', 'random'],
)

interactive(children=(Dropdown(description='cri', options=('entropy', 'gini'), value='entropy'), Dropdown(desc…

<function __main__.treebuild(cri, mxd, minsl, rs, spl)>

In [26]:
## final model

In [27]:
# Final model with fixed hyper-parameters, fitted on the balanced training split.
Approve_Tree_final = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=1,
    splitter='random',
    random_state=4,
)
Approve_Tree_final.fit(xtr, ytr)

# Predictions for the test and training portions of both splits.
pred_bal = Approve_Tree_final.predict(xte)
prr_bal = Approve_Tree_final.predict(xtr)
pred_ = Approve_Tree_final.predict(xte_o)
prr_ = Approve_Tree_final.predict(xtr_o)

# Sizes of the balanced test labels and of the original-test predictions.
print(len(yte), len(pred_))

# Accuracy = share of rows whose prediction equals the true label.
print("Test Accuracy original Data", np.mean(yte_o == pred_))
print("Train Accuracy Original Data", np.mean(ytr_o == prr_))
print("Test Accuracy Balanced Data", np.mean(yte == pred_bal))
print("Train Accuracy Balanced Data", np.mean(ytr == prr_bal))

# Per-class precision / recall on the original test portion.
print(classification_report(yte_o, pred_))

341 184
Test Accuracy original Data 0.9945652173913043
Train Accuracy Original Data 0.9976580796252927
Test Accuracy Balanced Data 0.9970674486803519
Train Accuracy Balanced Data 0.9987389659520807
              precision    recall  f1-score   support

          No       1.00      0.99      1.00       172
         Yes       0.92      1.00      0.96        12

   micro avg       0.99      0.99      0.99       184
   macro avg       0.96      1.00      0.98       184
weighted avg       0.99      0.99      0.99       184



In [28]:
# Rebuild the final tree with the same hyper-parameters, this time fitting on
# plain NumPy arrays (`.values`) rather than on the pandas objects.
Approve_Tree_final = DecisionTreeClassifier(criterion='entropy',max_depth=10,min_samples_leaf=1,random_state=4,splitter='random')
Approve_Tree_final.fit(xtr.values,ytr.values)
# Predict one row of the balanced test portion; wrapping the row in a list
# gives predict a 2-D, single-sample input. This overwrites the earlier pred_bal.
pred_bal = Approve_Tree_final.predict([xte.values[1]])
print(pred_bal)

['Yes']


In [29]:
def modeldtree(xtr,ytr,xte):
    """Fit the final decision tree on the training data and predict for ``xte``.

    All three inputs may be pandas objects or array-likes. They are converted
    with ``np.asarray`` so that fitting and prediction always see plain arrays;
    callers no longer have to pass DataFrames for training and ``.values`` for
    prediction.

    Parameters
    ----------
    xtr : DataFrame or 2-D array-like
        Training features.
    ytr : Series or 1-D array-like
        Training labels.
    xte : DataFrame or 2-D array-like
        Feature rows to predict.

    Returns
    -------
    numpy.ndarray
        Predicted label for each row of ``xte``.
    """
    tree = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=10,
        min_samples_leaf=1,
        random_state=4,
        splitter='random',
    )
    tree.fit(np.asarray(xtr), np.asarray(ytr))
    return tree.predict(np.asarray(xte))

In [None]:
# Try the helper on the balanced split; the test features are passed as a NumPy array.
modeldtree(xtr,ytr,xte.values)

In [None]:
# Column labels of the modelling subset, for reference when building a record by hand.
df_s.columns

In [None]:
# Hand-built applicant record keyed by the same labels as df_s's columns.
A = pd.Series({
    'Python (out of 3)': 1,
    'R Programming (out of 3)': 2,
    'Data Science (out of 3)': 3,
    'Current Year Of Graduation': 2019,
    'skillsown': 9,
    'Approval': 'No',
})

In [None]:
# Display the hand-built record.
A

In [None]:
# Append the hand-built record A as one extra row. The Series is first turned
# into a one-row frame (to_frame().T); concatenating the bare Series along
# axis 0 would not line its entries up with the df_s columns as a new row.
# ignore_index=True renumbers the rows so the appended one gets a fresh label.
pd.concat([df_s.head(), A.to_frame().T], axis=0, ignore_index=True)

In [None]:
# Show the last row of the modelling subset.
df_s.iloc[-1]

## Create pickle of final model

In [None]:
import pickle

In [None]:
# Serialise the fitted tree; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("Approve_Tree_final", 'wb') as model_file:
    pickle.dump(Approve_Tree_final, model_file)

In [None]:
# Load the pickled model back; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open("Approve_Tree_final", 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# One row of the balanced test features, wrapped in a list so it forms a single-sample 2-D input.
[xte.values[1]]

In [None]:
# predict returns one label per input row; take the single element explicitly
# instead of relying on the truthiness of a one-element array comparison.
if model.predict([xte.values[1]])[0] == "Yes":
    print("Approved")