# In-House Enrollment Analytics 

<H3>Recreating and improving upon 3rd party analytics</H3>

<H3>by Michael Greene, PhD</H3>

This report is a brief summary (with comments) of the code used to develop a logistic regression model which was used to predict if admitted students will enroll at our University. 

Let's begin by importing some packages:

In [18]:
import pandas as pd 
import numpy as np 
# import GreeneLib as gl ## This is a custom library I built with a few functions I use reguarly
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [19]:
## -- Load and Merge -- ##

df = pd.read_csv("Data1.csv", header=0, sep=",")
df_acad = pd.read_csv("Data2.csv", header=0, sep=",")
df = pd.merge(df, df_acad, how="left", left_on="TCU ID", right_on="StudentID")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5315 entries, 0 to 5314
Data columns (total 15 columns):
TCU ID                          5315 non-null int64
App Demonstrated Interest       5287 non-null float64
TCU Deposit IQ Response         1221 non-null float64
Deposit IQ Response             4094 non-null float64
TCU Excitement Reasons          92 non-null object
TCU Deposit IQ Decision Help    849 non-null object
OtherDescription                130 non-null object
TCU Deposit IQ Contact Help     681 non-null object
Comments                        220 non-null object
OtherDescription2               10 non-null object
Deposited                       5315 non-null int64
Test_Train                      5315 non-null object
StudentID                       5217 non-null float64
ACRK                            5217 non-null float64
Semester                        5217 non-null object
dtypes: float64(5), int64(2), object(8)
memory usage: 664.4+ KB
None


Now to cleanup the results

In [20]:
def GreeneDytpe(dataframe, dtype_dict):
    """
    This function takes a dataframe as the first argument, and a dictionary of dtypes 
    as keys and list of columns as values and converts the given df to a new df with 
    provided dtypes
    """
    for dt, col in dtype_dict.items():
        for c in col:
            dataframe[c] = dataframe[c].astype(dt)
    print(dataframe.info())
    return(dataframe)

In [29]:
## -- Cleanup -- ##
df["Deposit_IQ_Response"] = df["Deposit IQ Response"].fillna(df["TCU Deposit IQ Response"])

ddict = {
    "category":["TCU Excitement Reasons", "TCU Deposit IQ Decision Help", 
                "TCU Deposit IQ Contact Help", 
                "Deposited", "Test_Train"],
    "float":["ACRK", "Deposit IQ Response", "TCU Deposit IQ Response",
             "App Demonstrated Interest"],
    "int8":["Deposit_IQ_Response"],
    "object":["TCU ID", "StudentID"]
    }


df = GreeneDytpe(df, ddict)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5085 entries, 0 to 5084
Data columns (total 16 columns):
TCU ID                          5085 non-null object
App Demonstrated Interest       5057 non-null float64
TCU Deposit IQ Response         1221 non-null float64
Deposit IQ Response             3864 non-null float64
TCU Excitement Reasons          92 non-null category
TCU Deposit IQ Decision Help    849 non-null category
OtherDescription                130 non-null object
TCU Deposit IQ Contact Help     681 non-null category
Comments                        220 non-null object
OtherDescription2               10 non-null object
Deposited                       5085 non-null category
Test_Train                      5085 non-null category
StudentID                       4995 non-null object
ACRK                            4995 non-null float64
Semester                        4995 non-null object
Deposit_IQ_Response             5085 non-null int8
dtypes: category(5), float64(4), int8(1),

In [22]:
## -- Define Factors and Targets -- ##
factors = ["App Demonstrated Interest",
               "Deposit_IQ_Response", 
               "ACRK", 
               ]
target = ["Deposited"]

# - Remove people who have already deposited from training - #
df = df[df["Deposit_IQ_Response"] != 5].reset_index(drop=True)

## - Splitting test/train -- ##
df_train = df[df["Test_Train"]=="TRAIN"].loc[:,factors+target
                    ].dropna(how="any", axis=0
                    ).reset_index(drop=True)
df_test = df[df["Test_Train"]=="TEST"].loc[:,factors+target
                    ].dropna(how="any", axis=0
                    ).reset_index(drop=True)

In [23]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df_train[target])

X = df_train.drop(target[0], axis=1)
X = pd.get_dummies(X, drop_first=True)
print(X.shape)
print(y.shape)

print(X.info())

(3808, 3)
(3808,)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3808 entries, 0 to 3807
Data columns (total 3 columns):
App Demonstrated Interest    3808 non-null float64
Deposit_IQ_Response          3808 non-null int8
ACRK                         3808 non-null float64
dtypes: float64(2), int8(1)
memory usage: 63.3 KB
None


  y = column_or_1d(y, warn=True)


In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=0)#, stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

## -- Logistic Classification -- ##
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, 
                             fit_intercept=True, intercept_scaling=1, class_weight="balanced", 
                             random_state=0, multi_class='auto', 
                             verbose=0, warm_start=False, n_jobs=None)
cv = 5
grid = {"C":np.logspace(-5,5,7), "penalty":["l1","l2"], "max_iter":[100,200,500]} # l1 lasso l2 ridge

logreg_cv = GridSearchCV(clf, grid, cv=cv, refit=True)
logreg_cv.fit(X_train, y_train)

print("Tuned hpyerparameters (best parameters):", logreg_cv.best_params_)
print("Best Training Accuracy: {trn_a}% (Base={ba}%)".format(trn_a=round(logreg_cv.best_score_*100, 2),
                                                                 ba=round(100-y_train.mean()*100, 2)))

(2551, 3)
(1257, 3)
(2551,)
(1257,)
Tuned hpyerparameters (best parameters): {'C': 1.0, 'max_iter': 100, 'penalty': 'l1'}
Best Training Accuracy: 93.49% (Base=80.28%)


In [25]:
## -- Evaluation Metrics -- ##
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, logreg_cv.predict(X_test))
print(matrix)
print("Testing Accuracy: {tst_a}% (Base={ba}%)".format(tst_a=round(logreg_cv.score(X_test, y_test)*100, 2),
                                                           ba=round(100-y_test.mean()*100, 2)))

print("Classes:",logreg_cv.best_estimator_.classes_) # 0 False, 1 True
coef_list = [round(w, 5) for w in logreg_cv.best_estimator_.coef_[0]]
print("Factor Weights:",list(zip(coef_list, X.columns)))
print("Intercept:", logreg_cv.best_estimator_.intercept_)

from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, logreg_cv.best_estimator_.predict(X_test), pos_label=1)
print("AUC Score:", metrics.auc(fpr, tpr))

[[960  53]
 [ 28 216]]
Testing Accuracy: 93.56% (Base=80.59%)
Classes: [0 1]
Factor Weights: [(0.21608, 'App Demonstrated Interest'), (2.6101, 'Deposit_IQ_Response'), (0.22891, 'ACRK')]
Intercept: [-7.19152611]
AUC Score: 0.9164630297930186


In [26]:
## -- Logistic Regression Equation -- ##
"""
f(x) = 1 / (1 + [exp(-(B0 + B1x1 + B2x2 + B3x3 + ... + BnXn))])
"""
print(X_test[0:3].to_string())
print(logreg_cv.best_estimator_.predict_proba(X_test[0:3])) # 0 is False (will not deposit); 1 is True (will deposit)
print(logreg_cv.best_estimator_.predict(X_test[0:3])) # 0 is False (will not deposit); 1 is True (will deposit)
print("...")

      App Demonstrated Interest  Deposit_IQ_Response  ACRK
1830                        4.0                    2   2.0
725                         3.0                    1   3.0
3804                        3.0                    1   3.0
[[0.65682869 0.34317131]
 [0.96254596 0.03745404]
 [0.96254596 0.03745404]]
[0 0 0]
...


#Thanks for reading 