## ---- Statistical decision making by Mrunal Bokil ----

### Data understanding and goal

#### Data description :

#### Goal : 

    1.
    2.
    

### Steps involved to analyze the data and reach our goal:

    1. Data loading (loading the files)
    2. Data handling (remove missing values; strip non-numeric suffixes such as '+', 'M', 'k' or '$' from numeric fields)
    3. Descriptive statistics (target vs )
    4. Data modeling (Apply ML algorithms such as decision tree, random forest, linear regression)
    5. Estimation and performance (r^2, rmse, roc?)

#### Note: The datasets and dataframes used here are illustrative. The code in each cell may not run as-is, but the syntax and overall flow can be reused in other projects.

### Import libraries and load data

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn import preprocessing

In [None]:
# Load the graduate-admissions dataset from the working directory.
grad_file = pd.read_csv("Admission_Predict.csv")
# A bare expression in the middle of a cell is not displayed, so the preview
# is printed explicitly; the shape stays the cell's displayed value.
print(grad_file.head())
grad_file.shape

### Data cleaning


In [None]:
# Clean the app-store columns so that they hold proper numeric values.
# NOTE(review): `data_file_app` is not loaded anywhere in the visible notebook;
# presumably it comes from an app-store CSV read elsewhere - confirm.


def _size_to_megabytes(raw):
    """Return a raw Size entry ('19M', '201k', 'Varies with device') in megabytes.

    Entries that cannot be parsed as a number (e.g. 'Varies with device')
    become NaN.
    """
    text = str(raw).replace('+', '').replace(',', '').strip()
    scale = 1.0
    if text.endswith('M'):
        text = text[:-1]
    elif text.endswith('k'):
        # Kilobyte entries are rescaled; merely stripping the suffix would
        # make '201k' look larger than '19M'.
        # NOTE(review): 1024 kB per MB is assumed - confirm against the source.
        text, scale = text[:-1], 1.0 / 1024
    try:
        return float(text) * scale
    except ValueError:
        return np.nan


# Keep only the rows whose review count is a plain integer string. The copy
# makes the assignments below act on an independent frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
numeric_reviews = data_file_app['Reviews'].astype(str).str.isnumeric()
data_file_app = data_file_app[numeric_reviews].copy()
data_file_app['Reviews'] = data_file_app['Reviews'].astype(int)

# '10,000+' -> 10000
data_file_app['Installs'] = (
    data_file_app['Installs'].astype(str)
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Sizes are kept as floats in megabytes; truncating them to int would lose
# the fractional part (e.g. 8.7 -> 8).
data_file_app['Size'] = data_file_app['Size'].apply(_size_to_megabytes)

#what is the size of the most of the apps
print("Size of the apps: ", data_file_app['Size'].mode())
print("Maximum size of the apps: ", data_file_app['Size'].max())

# '$4.99' -> 4.99. Price stays numeric; casting it back to object would undo
# this conversion.
data_file_app['Price'] = (
    data_file_app['Price'].astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
)

# Keep the actual rating values. The previous code replaced every rating with
# the 0/1 result of `rating < 5`. Values above 5 are masked as NaN.
# NOTE(review): assumes ratings are on a 0-5 scale - confirm.
rating = pd.to_numeric(data_file_app['Rating'], errors='coerce')
data_file_app['Rating'] = rating.where(rating <= 5)

print(data_file_app.head())
data_file_app.dtypes

### Descriptive statistics

In [None]:
# correlation plot

# 'Serial No.' is left out of the correlation matrix (presumably it is only a
# row identifier). drop(columns=...) is used because pandas 2.0 no longer
# accepts the axis as a second positional argument.
grad_file_corr = grad_file.drop(columns='Serial No.').corr()
print(grad_file_corr)
sns.heatmap(grad_file_corr, xticklabels=grad_file_corr.columns, yticklabels=grad_file_corr.columns)
plt.show()

In [None]:
# countplot (usually for target)

# One bar per value of the 'Type' column; the title is set on the axes object
# that seaborn returns.
type_axes = sns.countplot(data=data_file_app, x="Type")
type_axes.set_title('Overall Paid vs free apps')
plt.show()

In [None]:
# bar graph

#data prep: number of paid apps per category and each category's share in percent
data_file_app1 = data_file_app[data_file_app['Type'] == 'Paid']
number_of_apps_in_category_typ = data_file_app1['Category'].value_counts().sort_values(ascending=False)
number_of_apps_in_category_typ = number_of_apps_in_category_typ.reset_index()
# Assigning the names positionally works whatever labels reset_index produced.
number_of_apps_in_category_typ.columns = ["Category", "Count_cat"]
number_of_apps_in_category_typ['perc'] = (
    number_of_apps_in_category_typ['Count_cat'] / number_of_apps_in_category_typ['Count_cat'].sum()
) * 100
print(number_of_apps_in_category_typ.head(5))

#plotting the bar graph
# The plot uses the frame prepared above; the previous code referred to
# `number_of_apps_in_category`, which is not defined in this notebook.
plt.figure(figsize=(10, 5))
index = np.arange(len(number_of_apps_in_category_typ.Category))  # x-axis
plt.bar(index, number_of_apps_in_category_typ.Count_cat)  # y-axis
plt.xlabel('Category', fontsize=12)
plt.ylabel('No. of apps', fontsize=12)
plt.xticks(index, number_of_apps_in_category_typ.Category, fontsize=12, rotation=80)
plt.title('Number of apps per category')
plt.show()

In [None]:
# histogram for gre scores to see what is average GRE score across the data

# first type
# NOTE: distplot is deprecated in seaborn >= 0.11 (histplot/displot replace it);
# it is kept here so the cell still runs on older seaborn releases.
sns.distplot(grad_file['GRE Score'])
plt.show()

# second type
grad_file.hist(column='GRE Score')
plt.show()

# histogram of all variables together
# drop(columns=...) replaces the positional axis argument, which pandas 2.0
# no longer accepts.
grad_file.drop(columns='Serial No.').hist(bins=30, figsize=(20, 15))
plt.show()

In [None]:
# describe: summary statistics of grad_file, shown as the cell's output

grad_file.describe()

In [None]:
# Scale the data and calculate one-way ANOVA to see if there is a difference between the means

# Names given to the scaled columns, in the order of grad_file's columns.
scaled_column_names = [
    'Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating',
    'SOP', 'LOR', 'CGPA', 'Research', 'Chance_admit',
]

# Min-max scale every column of grad_file and wrap the array in a labelled frame.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(grad_file)
df_normalized = pd.DataFrame(np_scaled, columns=scaled_column_names)

# One-way ANOVA between the scaled GRE and TOEFL scores; the result is the
# value displayed by the cell.
gre_scaled, toefl_scaled = df_normalized['GRE Score'], df_normalized['TOEFL Score']
stats.f_oneway(gre_scaled, toefl_scaled)

### Feature selection

In [None]:
# Variance Inflation factor

from statsmodels.stats.outliers_influence import variance_inflation_factor

all_input_var = ["GRE Score", "TOEFL Score", "University Rating", "SOP", "CGPA", "Research"]

# The selected inputs as a plain array; the VIF is computed per column position.
design_matrix = grad_file[all_input_var].values
vif = pd.DataFrame({
    "features": all_input_var,
    "vif_score": [
        variance_inflation_factor(design_matrix, position)
        for position in range(design_matrix.shape[1])
    ],
})
vif

In [None]:
# Convert binary variable to dummies using get_dummies

feature_cols = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'CGPA', 'Research']

# Target: the column at position 8 of grad_file, renamed to 'admit_score'.
# A Series is renamed by passing the new name directly; Series.rename() has
# no `columns` keyword, so the previous call raised a TypeError.
Y = grad_file[grad_file.columns[8]].rename('admit_score')

# Replace the 'Research' column with its dummy columns (prefix 'rsrch').
rsrch = pd.get_dummies(grad_file.Research, prefix='rsrch')
X = pd.concat([grad_file[feature_cols], rsrch], axis=1)
drops = ['Research']
X = X.drop(columns=drops)
X.head()

In [None]:
# recursive feature elimination

from sklearn.feature_selection import RFE
# Imported here because the modelling imports only appear in a later cell;
# without it this cell fails with a NameError when run top to bottom.
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
# create the RFE model around the linear regression estimator
# and select the 4 best attributes (the count is passed by keyword because
# newer scikit-learn releases no longer accept it positionally)
rfe = RFE(linreg, n_features_to_select=4)
rfe = rfe.fit(grad_file[['GRE Score', 'University Rating', 'SOP', 'CGPA', 'Research']], Y)
# print summaries for the selection of attributes
print(rfe.support_)
print(rfe.ranking_)

In [3]:
# LASSO for feature selection

### Data modeling

In [None]:
# split the data to training and testing

# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (already used for GridSearchCV below).
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [None]:
# import all ML algorithm libraries

from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor

from sklearn import tree 
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Linear regression

# Fit on the training split (fit returns the estimator itself).
regressor = LinearRegression().fit(X_train, y_train)

# Test-set R squared.
y_pred = regressor.predict(X_test)
lin_r2 = regressor.score(X_test, y_test)
print('Linear Regression R squared: %.4f' % lin_r2)

# Test-set RMSE, i.e. the square root of the mean squared error.
lin_mse = mean_squared_error(y_test, y_pred)
lin_rmse = np.sqrt(lin_mse)
print('Linear Regression RMSE: %.4f' % lin_rmse)

In [None]:
# Randomforest

# Default random forest with a fixed seed, fitted on the training split.
forest_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Test-set RMSE.
y_pred = forest_reg.predict(X_test)
forest_mse = mean_squared_error(y_test, y_pred)
forest_rmse = np.sqrt(forest_mse)
print('Random Forest RMSE: %.4f' % forest_rmse)

In [None]:
# Randomforest with Grid Search

from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; every combination is evaluated.
grid_1 = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

# 5-fold cross-validated search using all available cores (n_jobs=-1).
rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator=rf, param_grid=grid_1, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the grid search (no `scoring` was
# passed to GridSearchCV, so the estimator's own score method is used).
grid_search.best_score_

In [None]:
# Hyperparameter combination from grid_1 that achieved the best score.
grid_search.best_params_

In [None]:
# Random forest refitted with fixed hyperparameters.
# NOTE(review): the values are presumably copied from grid_search.best_params_ - confirm.
rfreg1 = RandomForestRegressor(
    bootstrap=True,
    max_depth=100,
    max_features=2,
    min_samples_leaf=3,
    min_samples_split=10,
    n_estimators=100,
)
rfreg1.fit(X_train, y_train)

# Test-set R squared; the label now names the model that is actually scored
# (it previously said "Linear Regression").
print('Random Forest R squared: %.4f' % rfreg1.score(X_test, y_test))

# Test-set RMSE (predictions are computed once).
y_pred = rfreg1.predict(X_test)
rfreg1_mse = mean_squared_error(y_test, y_pred)
rfreg1_rmse = np.sqrt(rfreg1_mse)
print('Random Forest RMSE: %.4f' % rfreg1_rmse)

In [None]:
# GBM

# Gradient boosting regressor with default settings, fitted on the training split.
model = GradientBoostingRegressor().fit(X_train, y_train)

gbm_r2 = model.score(X_test, y_test)
print('Gradient Boosting R squared: %.4f' % gbm_r2)

# Test-set RMSE.
y_pred = model.predict(X_test)
model_mse = mean_squared_error(y_test, y_pred)
model_rmse = np.sqrt(model_mse)
print('Gradient Boosting RMSE: %.4f' % model_rmse)

In [None]:
# Decision tree

# Decision tree regressor with default settings, fitted on the training split.
model = DecisionTreeRegressor().fit(X_train, y_train)

tree_r2 = model.score(X_test, y_test)
print('Decision tree R squared: %.4f' % tree_r2)

# Test-set RMSE.
y_pred = model.predict(X_test)
model_mse = mean_squared_error(y_test, y_pred)
model_rmse = np.sqrt(model_mse)
print('Decision Tree RMSE: %.4f' % model_rmse)

## Thank you!

In [None]:
# using lasso regression

from sklearn.linear_model import Lasso

# Lasso with its default alpha, fitted on the training split.
lasso = Lasso().fit(X_train, y_train)

train_score = lasso.score(X_train, y_train)
test_score = lasso.score(X_test, y_test)

# Number of coefficients the model left non-zero.
coeff_used = (lasso.coef_ != 0).sum()

report_lines = (
    ("training score:", train_score),
    ("test score: ", test_score),
    ("number of features used: ", coeff_used),
)
for label, value in report_lines:
    print(label, value)

In [None]:
# Lasso with smaller regularisation strengths (alpha = 0.01 and alpha = 0.0001)

import numpy as np
from sklearn.metrics import mean_squared_error


def _fit_and_report_lasso(alpha):
    """Fit Lasso(alpha) on the training split and print its train/test scores.

    Returns the tuple (model, train_score, test_score, coeff_used), where
    coeff_used is the number of non-zero coefficients of the fitted model.
    """
    # max_iter must be an integer: scikit-learn's parameter validation
    # rejects the float 10e5 that was passed before.
    model = Lasso(alpha=alpha, max_iter=1000000)
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    coeff_used = np.sum(model.coef_ != 0)

    print("training score for alpha=%s:" % alpha, train_score)
    print("test score for alpha =%s: " % alpha, test_score)
    print("number of features used: for alpha =%s:" % alpha, coeff_used)
    return model, train_score, test_score, coeff_used


# alpha = 0.01
lasso001, train_score001, test_score001, coeff_used001 = _fit_and_report_lasso(0.01)

# alpha = 0.0001
lasso00001, train_score00001, test_score00001, coeff_used00001 = _fit_and_report_lasso(0.0001)

# Other strengths (e.g. 0.05 or 0.000001) can be tried by calling
# _fit_and_report_lasso with that alpha.

# Test-set metrics of the alpha = 0.0001 model. The labels name the Lasso
# model (they previously said "Linear Regression"), and the results go into
# their own variables so the linear-regression lin_mse / lin_rmse are not
# overwritten.
y_pred = lasso00001.predict(X_test)
print('Lasso (alpha=0.0001) R squared: %.4f' % test_score00001)

lasso_mse = mean_squared_error(y_test, y_pred)
lasso_rmse = np.sqrt(lasso_mse)
print('Lasso (alpha=0.0001) RMSE: %.4f' % lasso_rmse)