In [27]:
%matplotlib inline
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from config import db_password

# Load the dataset

In [28]:
# Data source: https://www.kaggle.com/malapatiravi/graduate-school-admission-data/home

In [29]:
# Set up the connection to the SQL database that holds the model data.
# The password is percent-encoded first: characters such as "@", ":", "/" or "%"
# would otherwise be read as URL delimiters/escapes when the string is parsed.
escaped_password = quote(db_password, safe="")
db_string = f"postgresql://postgres:{escaped_password}@localhost:5432/Final_Project"
# Create the connection
engine = create_engine(db_string)

In [30]:
# Read every row of the CLEAN_MODEL_DATA table into a DataFrame via the engine,
# then display it.
data = pd.read_sql(sql="SELECT * FROM CLEAN_MODEL_DATA", con=engine)
data

Unnamed: 0,admit,gre,gpa,rank
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
395,0,620.0,4.00,2
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2


In [31]:
# Alternative: load the dataset from a CSV file instead of the PostgreSQL database connection (uncomment the lines below to use it)
#file_path = "./Resources/graduate_model_data_test.csv"
#data = pd.read_csv(file_path)
#data.head()

In [32]:
# Inspect the dtype pandas assigned to each column of the loaded table.
data.dtypes

admit      int64
gre      float64
gpa      float64
rank       int64
dtype: object

In [33]:
# Discard every row that has a missing value in any column, then show the result.
data = data.dropna(axis="index", how="any")
data

Unnamed: 0,admit,gre,gpa,rank
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
395,0,620.0,4.00,2
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2


In [34]:
# Count the applicants who were NOT admitted (admit == 0).
# Stored as `rejected` because these rows are the non-admitted applicants.
rejected = data.loc[data["admit"] == 0]
len(rejected)

273

In [35]:
# Keep only the admitted applicants (admit == 1) and report how many there are.
admits = data[data["admit"].eq(1)]
len(admits)

127

# Feature Selection: Separate the independent feature variables (X) from the dependent target variable (y)

In [36]:
# Feature matrix: the three predictor columns.
X = data.loc[:, ["gre", "gpa", "rank"]]
# Target vector: the admission outcome column.
y = data.loc[:, "admit"]
# Print both shapes so the row counts can be compared.
print(X.shape, y.shape)

(400, 3) (400,)


# Model Iteration #1

# Split the data into training and testing sets

In [37]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training, the remainder to testing; the fixed
# random_state makes the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

# Load the model

In [39]:
from sklearn.linear_model import LogisticRegression

# Create the (not yet fitted) logistic regression classifier.
# class_weight="balanced" asks scikit-learn to weight each class inversely to
# its frequency in the training labels.
classifier = LogisticRegression(class_weight="balanced")
classifier

LogisticRegression(class_weight='balanced')

# Train the model

In [40]:
# Fit the classifier on the training split; the fitted estimator is displayed.
classifier.fit(X=X_train, y=y_train)

LogisticRegression(class_weight='balanced')

In [41]:
# Score the fitted classifier on the rows it was trained on and on the
# held-out rows, then report both values.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

Training Data Score: 0.60625
Testing Data Score: 0.5375


# Predict using the test data

In [42]:
# Predict labels for the held-out rows and show them next to the true labels.
predictions = classifier.predict(X_test)
pd.DataFrame(
    data={
        "Prediction": predictions,
        "Actual": y_test,
    }
)

Unnamed: 0,Prediction,Actual
398,1,0
125,0,0
328,1,0
339,0,1
172,0,0
...,...,...
347,1,0
41,1,1
180,0,0
132,1,0


## Prediction vs. Outcome when Prediction = 1

In [44]:
# Rebuild the prediction/actual table and keep only the rows where the
# model predicted class 1.
predictions = classifier.predict(X_test)
outcome = pd.DataFrame(data={"Prediction": predictions, "Actual": y_test})
predict1 = outcome[outcome["Prediction"].eq(1)]
predict1


Unnamed: 0,Prediction,Actual
398,1,0
328,1,0
291,1,0
29,1,0
284,1,1
372,1,1
188,1,0
321,1,0
5,1,1
78,1,0


## Prediction vs. Outcome when Prediction = 0

In [43]:
# Rebuild the prediction/actual table and keep only the rows where the
# model predicted class 0.
predictions = classifier.predict(X_test)
outcome = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# Stored as `predict0` so the name matches the Prediction == 0 filter.
predict0 = outcome.loc[outcome["Prediction"] == 0]
predict0

Unnamed: 0,Prediction,Actual
125,0,0
339,0,1
172,0,0
342,0,0
197,0,1
174,0,0
324,0,0
227,0,0
371,0,1
223,0,0


# Model Iteration #2 -- Same as Model #1 except scaling the data using StandardScaler

In [14]:
from sklearn.preprocessing import StandardScaler

# Unfitted standardising scaler (centres each feature and scales it to unit
# variance once fitted); the arguments spell out the library defaults.
data_scaler = StandardScaler(with_mean=True, with_std=True)

# Scale data using StandardScaler

In [15]:
# Fit the scaler on the feature matrix and transform that same matrix,
# then preview the first five scaled rows.
# NOTE(review): this fits the scaler on every row. That is fine for previewing
# the scaled values, but a scaler used for model evaluation should be fit on
# the training split only, otherwise test-set statistics leak into training.
X_data_scaled = data_scaler.fit(X).transform(X)
X_data_scaled[:5]

array([[-1.80026271,  0.57907221,  0.54596793],
       [ 0.62666824,  0.73692924,  0.54596793],
       [ 1.84013372,  1.60514292, -1.57429586],
       [ 0.45331603, -0.52592701,  1.60609982],
       [-0.58679723, -1.20997415,  1.60609982]])

# Split into testing & training data

In [16]:
from sklearn.model_selection import train_test_split

# Split the RAW features first so the held-out rows never influence the scaler.
# Same train_size/random_state as before, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to the test rows. Fitting on the full dataset would leak
# test-set statistics into training (data leakage).
X_train = data_scaler.fit_transform(X_train_raw)
X_test = data_scaler.transform(X_test_raw)

In [17]:
from sklearn.linear_model import LogisticRegression

# Fresh, unfitted classifier for the scaled-feature iteration, configured the
# same way as in iteration #1 (balanced class weights).
classifier = LogisticRegression(class_weight="balanced")
classifier

LogisticRegression(class_weight='balanced')

# Fit Logistic Regression model to training data

In [18]:
# Fit the classifier on the scaled training split; the fitted estimator is displayed.
classifier.fit(X=X_train, y=y_train)

LogisticRegression(class_weight='balanced')

In [19]:
# Score the fitted classifier on the training rows and on the held-out rows,
# then report both values.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

Training Data Score: 0.6375
Testing Data Score: 0.525


# Dataframes of Predictions vs. Actual

## When prediction = 1

In [None]:
# Rebuild the prediction/actual table for the current classifier and keep only
# the rows where the model predicted class 1.
predictions = classifier.predict(X_test)
outcome = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
predict1 = outcome.loc[outcome["Prediction"] == 1]
predict1

## When prediction = 0

In [45]:
# Rebuild the prediction/actual table for the current classifier and keep only
# the rows where the model predicted class 0.
predictions = classifier.predict(X_test)
outcome = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# Stored as `predict0` so the name matches the Prediction == 0 filter.
predict0 = outcome.loc[outcome["Prediction"] == 0]
predict0

Unnamed: 0,Prediction,Actual
125,0,0
339,0,1
172,0,0
342,0,0
197,0,1
174,0,0
324,0,0
227,0,0
371,0,1
223,0,0


# Balanced Accuracy Score

In [21]:
from sklearn.metrics import balanced_accuracy_score

# Predict on the held-out rows, then compute the balanced accuracy of those
# predictions against the true labels.
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.5828262339418526

# Confusion Matrix

In [22]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true test labels against the predictions made above.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[19, 32],
       [ 6, 23]], dtype=int64)

# Classification Reports: imbalanced-learn and scikit-learn

In [23]:
from imblearn.metrics import classification_report_imbalanced

# Text report from imbalanced-learn for the test-set predictions.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.37      0.79      0.50      0.54      0.28        51
          1       0.42      0.79      0.37      0.55      0.54      0.31        29

avg / total       0.64      0.53      0.64      0.52      0.54      0.29        80



In [24]:
from sklearn.metrics import classification_report

# Text report from scikit-learn for the same test-set predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.76      0.37      0.50        51
           1       0.42      0.79      0.55        29

    accuracy                           0.53        80
   macro avg       0.59      0.58      0.52        80
weighted avg       0.64      0.53      0.52        80

