In [1]:
import numpy as np
import matplotlib.pyplot as plt  # To visualize
import pandas as pd  # To read data
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML
import seaborn as sn


In [2]:
data = pd.read_stata('berkeley.dta', preserve_dtypes=False)

In [3]:
data['gender'].describe()

count     4526
unique       2
top       Male
freq      2691
Name: gender, dtype: object

In [4]:
gender = {'Male': 0,'Female': 1}
data.gender = [gender[item] for item in data.gender]
admit = {'Rejected': 0,'Admitted': 1}
data.admit = [admit[item] for item in data.admit]
dept = {'A': 0, 'B': 1, 'C':2, 'D': 3, 'E': 4, 'F': 5}
data.dept = [dept[item] for item in data.dept]


In [5]:
onlywomen = data.loc[data['gender'] == 1].copy()
onlywomen['admit'].describe()

count    1835.000000
mean        0.303542
std         0.459913
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: admit, dtype: float64

In [6]:
data.agg(
    {
        "gender": ["min", "max", "median", "skew"],
        "admit": ["min", "max", "median", "mean"],
    }
)

Unnamed: 0,gender,admit
min,0.0,0.0
max,1.0,1.0
median,0.0,0.0
skew,0.385339,
mean,,0.38776


In [11]:
data.dtypes

applicant     int64
admit         int64
gender        int64
dept         object
dtype: object

In [38]:
LR = LinearRegression()  # create object for the class


In [12]:
X = data['gender'].values.reshape(-1, 1)  # values converts it into a numpy array
y = data['admit'].values.reshape(-1, 1)  # values converts it into a numpy array


In [59]:
LR.fit(X, y)

LinearRegression()

In [57]:
print('Coefficients: \n', LR.coef_)


Coefficients: 
 [[-0.14164543]]


In [13]:
X = sm.add_constant(X)


In [18]:
model = sm.OLS(data['admit'], data['gender']).fit()

In [19]:
model.summary()

0,1,2,3
Dep. Variable:,admit,R-squared (uncentered):,0.096
Model:,OLS,Adj. R-squared (uncentered):,0.096
Method:,Least Squares,F-statistic:,482.4
Date:,"Sun, 10 Oct 2021",Prob (F-statistic):,1.1099999999999999e-101
Time:,18:52:17,Log-Likelihood:,-4049.0
No. Observations:,4526,AIC:,8100.0
Df Residuals:,4525,BIC:,8106.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
gender,0.3035,0.014,21.964,0.000,0.276,0.331

0,1,2,3
Omnibus:,24363.738,Durbin-Watson:,0.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,574.437
Skew:,0.385,Prob(JB):,1.83e-125
Kurtosis:,1.434,Cond. No.,1.0


In [16]:
data['dept'].unique()

array(['A', 'B', 'C', 'D', 'E', 'F'], dtype=object)

In [20]:
data.describe()

Unnamed: 0,applicant,admit,gender,dept
count,4526.0,4526.0,4526.0,4526.0
mean,2263.5,0.38776,0.405435,2.364781
std,1306.687989,0.487293,0.49103,1.712402
min,1.0,0.0,0.0,0.0
25%,1132.25,0.0,0.0,1.0
50%,2263.5,0.0,0.0,2.0
75%,3394.75,1.0,1.0,4.0
max,4526.0,1.0,1.0,5.0


In [None]:
Income_Gini = sm.OLS(data['Gini'], sm.add_constant(data['Income_Per_Capita'])).fit()
