In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, log_loss
import pickle

In [3]:
# Import data
df = pd.read_stata('primary_sample.csv')

# Extract variables of interest
df = df[['year', 'school', 'admit', 'gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl']]

# Drop observations with missing variables
df = df.dropna(axis='index') # Drop missing

# Convert year, school to dummies for TWFE
df = pd.get_dummies(df, columns=['year', 'school'], drop_first=True) # First column is dropped to prevent collinearity

# Clean up
df = df.replace(['False', 'True'], [0, 1])

In [4]:
# Define features and outcome
y = df['admit'].ravel()
X = df.drop(['admit'], axis=1)

In [5]:
# Define model
model = LogisticRegression(n_jobs=-1, max_iter=10000, solver='sag')
model.fit(X, y)

# Predict admit
y_hat = np.array([i for i in model.predict(X)])

# Print outputs
print('Coefficients')
[print(a, ':', round(b, 3)) for a, b in zip(model.feature_names_in_[0:6], model.coef_.flatten()[0:6])]
print('')

print('Intercept:', round(model.intercept_.item(), 3), '\n')

print('Goodness of Fit')
print('Cross Entropy (Log Loss):', round(log_loss(y, y_hat), 3))
print('Accuracy:', round(model.score(X, y), 3))
print('MSE:', round(mean_squared_error(y, y_hat), 3))

Coefficients
gpa : 3.424
lsat : 0.315
urm : 1.341
fee_waived : 0.334
non_trad : -0.076
intl : -0.623

Intercept: -59.135 

Goodness of Fit
Cross Entropy (Log Loss): 4.314
Accuracy: 0.88
MSE: 0.12


In [6]:
# Save model
pickle.dump(model, open('logit.sav', 'wb'))

# Save columns
pickle.dump(list(df.columns), open('columns.pkl', 'wb'))