In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/

import pickle
from tqdm import tqdm
import os

# from plotly import plotly
# import plotly.offline as offline
# import plotly.graph_objs as go
# offline.init_notebook_mode()
from collections import Counter

#### Loading data

In [2]:
# Load the preprocessed dataset, reading only the first 5000 rows of the CSV.
data = pd.read_csv(
    'preprocessed_data.csv',
    nrows=5000,
)

In [3]:
# Preview the first five rows (the default row count for head()).
data.head(n=5)

Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,clean_categories,clean_subcategories,essay,price
0,ca,mrs,grades_prek_2,53,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,725.05
1,ut,ms,grades_3_5,4,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,213.03
2,ca,mrs,grades_prek_2,10,1,literacy_language,literacy,having class 24 students comes diverse learner...,329.0
3,ga,mrs,grades_prek_2,2,1,appliedlearning,earlydevelopment,i recently read article giving students choice...,481.04
4,wa,mrs,grades_3_5,2,1,literacy_language,literacy,my students crave challenge eat obstacles brea...,17.74


In [5]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(5000, 9)

#### Splitting data

In [10]:
# Separate predictors from the label:
#   X - every column of `data` except `project_is_approved` (still a DataFrame)
#   y - the `project_is_approved` column as a NumPy array
X = data.drop(columns=['project_is_approved'])
y = data['project_is_approved'].values


Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,clean_categories,clean_subcategories,essay,price
0,ca,mrs,grades_prek_2,53,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,725.05


In [11]:
from sklearn.model_selection import train_test_split

# Two stratified splits:
#   1. X / y              -> 67% (train + cv) and 33% test
#   2. the (train + cv)   -> 67% train and 33% cv
# Both calls receive a fixed random_state. Without it each run shuffles differently, so the
# splits -- and every vocabulary, feature matrix and score derived from them -- would change
# on every Restart & Run All.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=SPLIT_SEED
)
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train, y_train, test_size=0.33, stratify=y_train, random_state=SPLIT_SEED
)

### Encoding text and categorical features

In [12]:
# Bag-of-words over the essays: 1- to 4-grams that occur in at least 10 documents,
# capped at the 5000 most frequent terms.
vectorizer = CountVectorizer(min_df=10, ngram_range=(1, 4), max_features=5000)

# The vocabulary is learned from the training split only; cv and test are merely transformed.
X_train_essay_enc = vectorizer.fit_transform(X_train['essay'].values)
X_cv_essay_enc, X_test_essay_enc = (
    vectorizer.transform(split['essay'].values)
    for split in (X_cv, X_test)
)

In [14]:
# Token-count encoding of `school_state`; the vocabulary is learned from the training split only.
# NOTE(review): this refits the shared `vectorizer`, so the vocabulary it held before is replaced
# and the settings it was constructed with (min_df, ngram_range, max_features) apply here too.
vectorizer.fit(X_train['school_state'].values)
X_train_state_enc, X_cv_state_enc, X_test_state_enc = (
    vectorizer.transform(split['school_state'].values)
    for split in (X_train, X_cv, X_test)
)

In [15]:
# Token-count encoding of `teacher_prefix`; the vocabulary is learned from the training split only.
# NOTE(review): this refits the shared `vectorizer`, so the vocabulary it held before is replaced
# and the settings it was constructed with (min_df, ngram_range, max_features) apply here too.
vectorizer.fit(X_train['teacher_prefix'].values)
X_train_teacher_prefix_enc, X_cv_teacher_prefix_enc, X_test_teacher_prefix_enc = (
    vectorizer.transform(split['teacher_prefix'].values)
    for split in (X_train, X_cv, X_test)
)

In [16]:
# Token-count encoding of `project_grade_category`; the vocabulary is learned from the training split only.
# NOTE(review): this refits the shared `vectorizer`, so the vocabulary it held before is replaced
# and the settings it was constructed with (min_df, ngram_range, max_features) apply here too.
vectorizer.fit(X_train['project_grade_category'].values)
(
    X_train_project_grade_category_enc,
    X_cv_project_grade_category_enc,
    X_test_project_grade_category_enc,
) = (
    vectorizer.transform(split['project_grade_category'].values)
    for split in (X_train, X_cv, X_test)
)

In [18]:
# Token-count encoding of `clean_categories`; the vocabulary is learned from the training split only.
# NOTE(review): this refits the shared `vectorizer`, so the vocabulary it held before is replaced
# and the settings it was constructed with (min_df, ngram_range, max_features) apply here too.
vectorizer.fit(X_train['clean_categories'].values)
X_train_clean_category_enc, X_cv_clean_category_enc, X_test_clean_category_enc = (
    vectorizer.transform(split['clean_categories'].values)
    for split in (X_train, X_cv, X_test)
)

In [19]:
# Token-count encoding of `clean_subcategories`; the vocabulary is learned from the training split only.
# NOTE(review): this refits the shared `vectorizer`, so the vocabulary it held before is replaced
# and the settings it was constructed with (min_df, ngram_range, max_features) apply here too.
vectorizer.fit(X_train['clean_subcategories'].values)
(
    X_train_clean_subcategory_enc,
    X_cv_clean_subcategory_enc,
    X_test_clean_subcategory_enc,
) = (
    vectorizer.transform(split['clean_subcategories'].values)
    for split in (X_train, X_cv, X_test)
)

### Encoding numerical features

In [28]:
from sklearn.preprocessing import MaxAbsScaler, Normalizer

# Kept so that any other cell referring to `normalizer` still finds it;
# it is no longer used for the price column.
normalizer = Normalizer()

# Why not Normalizer: it rescales each *row* to unit norm. On an (n, 1) column every row holds
# a single value, so every non-zero price collapses to 1.0 and the feature becomes constant.
# MaxAbsScaler works column-wise instead: it learns the largest absolute price from the training
# split and divides every split by that one value, so train/cv/test share the same scale and
# nothing is learned from cv/test. Zero stays zero and the sign of each value is preserved.
price_scaler = MaxAbsScaler()
price_scaler.fit(X_train['price'].values.reshape(-1, 1))

# Each result is a dense (n_rows, 1) float array, ready to be hstack-ed with the sparse features.
X_train_price_norm = price_scaler.transform(X_train['price'].values.reshape(-1, 1))
X_cv_price_norm = price_scaler.transform(X_cv['price'].values.reshape(-1, 1))
X_test_price_norm = price_scaler.transform(X_test['price'].values.reshape(-1, 1))

In [32]:
# Sanity check: print "<feature shape> <label shape>" for train, test and cv (in that order)
# so the row counts of each price column can be compared with its label vector.
print(
    *(
        f"{features.shape} {labels.shape}"
        for features, labels in (
            (X_train_price_norm, y_train),
            (X_test_price_norm, y_test),
            (X_cv_price_norm, y_cv),
        )
    ),
    sep="\n",
)

(2244, 1) (2244,)
(1650, 1) (1650,)
(1106, 1) (1106,)


In [33]:
from sklearn.preprocessing import MaxAbsScaler

# Why not Normalizer: it rescales each *row* to unit norm. On an (n, 1) column every row holds
# a single value, so every non-zero count collapses to 1.0 and the feature degenerates into a
# "has posted before" indicator.
# MaxAbsScaler works column-wise instead: it learns the largest absolute count from the training
# split and divides every split by that one value, so train/cv/test share the same scale and
# nothing is learned from cv/test. Zero stays zero and the sign of each value is preserved.
tprojects_scaler = MaxAbsScaler()
tprojects_scaler.fit(X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1))

# Each result is a dense (n_rows, 1) float array, ready to be hstack-ed with the sparse features.
X_train_tprojects_norm = tprojects_scaler.transform(
    X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1)
)
X_cv_tprojects_norm = tprojects_scaler.transform(
    X_cv['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1)
)
X_test_tprojects_norm = tprojects_scaler.transform(
    X_test['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1)
)

In [34]:
#### Concatenating everything
from scipy.sparse import hstack

# Stack every encoded feature block column-wise and convert the result to CSR.
# The block order is the same for all three splits, so column i means the same thing
# in X_tr, X_cr and X_te:
#   essay | state | grade category | teacher prefix | category | subcategory | price | previous projects

# Training split
X_tr = hstack((
    X_train_essay_enc,
    X_train_state_enc,
    X_train_project_grade_category_enc,
    X_train_teacher_prefix_enc,
    X_train_clean_category_enc,
    X_train_clean_subcategory_enc,
    X_train_price_norm,
    X_train_tprojects_norm,
)).tocsr()

# Cross-validation split
X_cr = hstack((
    X_cv_essay_enc,
    X_cv_state_enc,
    X_cv_project_grade_category_enc,
    X_cv_teacher_prefix_enc,
    X_cv_clean_category_enc,
    X_cv_clean_subcategory_enc,
    X_cv_price_norm,
    X_cv_tprojects_norm,
)).tocsr()

# Test split
X_te = hstack((
    X_test_essay_enc,
    X_test_state_enc,
    X_test_project_grade_category_enc,
    X_test_teacher_prefix_enc,
    X_test_clean_category_enc,
    X_test_clean_subcategory_enc,
    X_test_price_norm,
    X_test_tprojects_norm,
)).tocsr()

In [35]:
# Sanity check: print "<feature matrix shape> <label shape>" for train, cv and test (in that order)
# so the row counts and the shared column count can be compared.
print(
    *(
        f"{features.shape} {labels.shape}"
        for features, labels in (
            (X_tr, y_train),
            (X_cr, y_cv),
            (X_te, y_test),
        )
    ),
    sep="\n",
)

(2244, 5131) (2244,)
(1106, 5131) (1106,)
(1650, 5131) (1650,)
