In [1]:
pip install psycopg2

Note: you may need to restart the kernel to use updated packages.


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
import psycopg2

In [3]:
# Set up the connection to the local PostgreSQL database that holds the model data.
from urllib.parse import quote

# Percent-encode the password before embedding it in the URL: characters such as
# "@", ":", "/" or "#" would otherwise be read as URL delimiters and break parsing.
encoded_password = quote(db_password, safe="")
db_string = f"postgresql://postgres:{encoded_password}@localhost:5432/Final_Project"
# Create the SQLAlchemy engine used to query the database
engine = create_engine(db_string)

In [4]:
# Read every row and column of the MODEL_DATA table into a DataFrame via the engine
model_data_query = '''SELECT * FROM MODEL_DATA'''
data = pd.read_sql(sql=model_data_query, con=engine)
data

Unnamed: 0,admit,gre,gpa,rank
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2
399,0,600.0,3.89,3


In [27]:
# Alternative: load the dataset from a CSV file instead of the pgAdmin/PostgreSQL database connection
#file_path = "./Resources/graduate_model_data_test.csv"
#data = pd.read_csv(file_path)
#data.head()

In [5]:
# Give the "rank" column a more descriptive name; all other columns are untouched
column_renames = {"rank": "undergrad_school_rank"}
data = data.rename(columns=column_renames)
data

Unnamed: 0,admit,gre,gpa,undergrad_school_rank
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2
399,0,600.0,3.89,3


In [6]:
# Show the dtype pandas assigned to each column of the loaded table
data.dtypes

admit                      int64
gre                      float64
gpa                      float64
undergrad_school_rank      int64
dtype: object

In [7]:
# Discard every row that has a missing value in any column
data = data.dropna(axis="index", how="any")
data

Unnamed: 0,admit,gre,gpa,undergrad_school_rank
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
395,0,620.0,4.00,2
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2


In [8]:
# Feature matrix: the three predictor columns; target vector: the "admit" column
feature_columns = ['gre', 'gpa', 'undergrad_school_rank']
X = data[feature_columns]
y = data['admit']
# Quick sanity check that features and target have the same number of rows
print(X.shape, y.shape)

(400, 3) (400,)


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable
split_parts = train_test_split(X, y, train_size=0.8, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [10]:
from sklearn.linear_model import LogisticRegression

# class_weight="balanced" weights each class inversely to its frequency in y
classifier = LogisticRegression(class_weight="balanced")
classifier

LogisticRegression(class_weight='balanced')

In [11]:
# Train the logistic regression on the training split (the fitted estimator is displayed)
classifier.fit(X=X_train, y=y_train)

LogisticRegression(class_weight='balanced')

In [12]:
# Mean accuracy on the rows the model was trained on and on the held-out rows
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

Training Data Score: 0.60625
Testing Data Score: 0.5375


In [13]:
# Predict the held-out rows and pair each prediction with its true label
predictions = classifier.predict(X_test)
outcome = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# Keep only the rows the model predicted as class 1
predicted_as_one = outcome["Prediction"] == 1
predict1 = outcome.loc[predicted_as_one]
predict1


Unnamed: 0,Prediction,Actual
398,1,0
328,1,0
291,1,0
29,1,0
284,1,1
372,1,1
188,1,0
321,1,0
5,1,1
78,1,0


In [14]:
# Rows of the test set whose true label is 1, alongside what the model predicted
actually_one = outcome["Actual"] == 1
actual1 = outcome.loc[actually_one]
actual1

Unnamed: 0,Prediction,Actual
339,0,1
197,0,1
284,1,1
372,1,1
371,0,1
5,1,1
242,0,1
214,1,1
201,1,1
361,1,1


In [15]:
# Scale the data

In [16]:
from sklearn.preprocessing import StandardScaler

# Scaler that standardises each feature column; it is fitted in a later cell
data_scaler = StandardScaler()

In [17]:
# Standardise the feature matrix (scaler fitted on all rows of X) and preview the first five rows
X_data_scaled = data_scaler.fit(X).transform(X)
X_data_scaled[0:5]

array([[-1.80026271,  0.57907221,  0.54596793],
       [ 0.62666824,  0.73692924,  0.54596793],
       [ 1.84013372,  1.60514292, -1.57429586],
       [ 0.45331603, -0.52592701,  1.60609982],
       [-0.58679723, -1.20997415,  1.60609982]])

In [18]:
from sklearn.model_selection import train_test_split

# Split the raw features first, then fit the scaler on the training rows only.
# Fitting StandardScaler on the full dataset before splitting lets the test rows
# influence the mean/variance used for scaling (data leakage), which makes the
# test score optimistic. The same train_size and random_state are kept.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=1)
X_train = data_scaler.fit_transform(X_train)
# Apply the training-set statistics to the held-out rows; never refit on test data
X_test = data_scaler.transform(X_test)

In [19]:
from sklearn.linear_model import LogisticRegression

# Fresh, unfitted model for the scaled features; "balanced" reweights classes by inverse frequency
classifier = LogisticRegression(class_weight="balanced")
classifier

LogisticRegression(class_weight='balanced')

In [20]:
# Train the new model on the current training split (the fitted estimator is displayed)
classifier.fit(X=X_train, y=y_train)

LogisticRegression(class_weight='balanced')

In [21]:
# Mean accuracy of the refitted model on the training rows and on the held-out rows
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

Training Data Score: 0.6375
Testing Data Score: 0.525


In [22]:
# Predict the held-out rows with the refitted model and pair predictions with true labels
predictions = classifier.predict(X_test)
outcome = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# Keep only the rows the model predicted as class 1
predicted_as_one = outcome["Prediction"] == 1
predict1 = outcome.loc[predicted_as_one]
predict1

Unnamed: 0,Prediction,Actual
398,1,0
328,1,0
291,1,0
29,1,0
372,1,1
321,1,0
371,1,1
5,1,1
78,1,0
223,1,0


In [23]:
# Calculate the balanced accuracy score of the test-set predictions
from sklearn.metrics import balanced_accuracy_score

y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.5828262339418526

In [24]:
# Show the confusion matrix of true labels versus predicted labels
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[19, 32],
       [ 6, 23]], dtype=int64)

In [25]:
# Build the imbalanced-learn classification report for the test predictions, then print it
from imblearn.metrics import classification_report_imbalanced

imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.37      0.79      0.50      0.54      0.28        51
          1       0.42      0.79      0.37      0.55      0.54      0.31        29

avg / total       0.64      0.53      0.64      0.52      0.54      0.29        80



In [28]:
# Build scikit-learn's per-class precision/recall/F1 report for the test predictions, then print it
from sklearn.metrics import classification_report

standard_report = classification_report(y_test, y_pred)
print(standard_report)

              precision    recall  f1-score   support

           0       0.76      0.37      0.50        51
           1       0.42      0.79      0.55        29

    accuracy                           0.53        80
   macro avg       0.59      0.58      0.52        80
weighted avg       0.64      0.53      0.52        80

