In [1]:
import random
import numpy as np
import pandas as pd

In [2]:
# Read the dataset from a CSV file addressed relative to the working directory.
csv_name = "./binary.csv"
df = pd.read_csv(filepath_or_buffer=csv_name)
# Preview the first five rows (5 is head()'s default).
df.head(5)

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [3]:
# One-hot encode the categorical `rank` column: build the indicator columns
# (named with a "rank_" prefix) first, then append them to the original frame.
rank_dummies = pd.get_dummies(df["rank"], prefix="rank")
data = pd.concat([df, rank_dummies], axis=1)
data.head()

Unnamed: 0,admit,gre,gpa,rank,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,3,0,0,1,0
1,1,660,3.67,3,0,0,1,0
2,1,800,4.0,1,1,0,0,0
3,1,640,3.19,4,0,0,0,1
4,0,520,2.93,4,0,0,0,1


In [4]:
# Remove the raw `rank` column from the frame.
# Note: this cell rebinds `data`, so running it a second time raises a KeyError
# because the column is already gone.
data = data.drop(["rank"], axis="columns")
data.head()

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,0,0,1,0
1,1,660,3.67,0,0,1,0
2,1,800,4.0,1,0,0,0
3,1,640,3.19,0,0,0,1
4,0,520,2.93,0,0,0,1


In [5]:
# Standardize gre and gpa to zero mean and unit (sample) standard deviation.
# The column is replaced with plain `data[field] = ...` rather than
# `data.loc[:, field] = ...`: the latter writes the float z-scores *into* the
# existing column, which newer pandas versions flag as an incompatible-dtype
# assignment when that column holds integers (as `gre` does).
for field in ("gre", "gpa"):
    mean, std = data[field].mean(), data[field].std()
    data[field] = (data[field] - mean) / std
data.head()

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,-1.798011,0.578348,0,0,1,0
1,1,0.625884,0.736008,0,0,1,0
2,1,1.837832,1.603135,1,0,0,0
3,1,0.452749,-0.525269,0,0,0,1
4,0,-0.586063,-1.208461,0,0,0,1


In [6]:
# Split off a random 10% of the rows for testing; the other 90% stay in `data`.
random.seed(2018)
# Python 3's random.sample only accepts a Sequence and raises TypeError for a
# pandas Index, so the index labels are materialized as a list first.
sample = random.sample(list(data.index), k=int(len(data)*0.9))
print(type(sample), len(sample))
# `sample` holds index *labels*. Select the training rows by label (.loc) so the
# selection agrees with the label-based drop() that builds the test set; the
# previous positional .iloc only matched while the index was exactly 0..n-1.
data, test_data = data.loc[sample], data.drop(sample)
print(len(data), len(test_data))

(<type 'list'>, 360)
(360, 40)


In [7]:
# Separate the model inputs from the `admit` label, for the training partition
# and for the held-out test partition.
target_column = 'admit'
features = data.drop(target_column, axis=1)
targets = data[target_column]
features_test = test_data.drop(target_column, axis=1)
targets_test = test_data[target_column]

In [10]:
# Initialize one weight per feature column, drawn from a zero-mean normal
# distribution with standard deviation 1/sqrt(n_features).
n_features = len(features.columns)
# Only the stdlib `random` module is seeded elsewhere in this notebook, which
# does not affect NumPy. A dedicated seeded generator makes the initial weights
# reproducible across kernel restarts without touching NumPy's global RNG state.
weights_rng = np.random.RandomState(2018)
weights = weights_rng.normal(scale=1/n_features**0.5, size=n_features)
print(weights)

[ 0.43305888 -0.00384952  0.29443075 -0.44724236 -0.44002037  0.3160498 ]


In [23]:
# Weighted sum of the features for every row, computed as a single matrix-vector
# product. This replaces a Python-level iterrows() loop that evaluated
# np.dot(weights, row) one row at a time and threw each result away.
# The cast to float gives np.dot a homogeneous numeric array even if the
# indicator columns are stored as a non-float dtype.
linear_outputs = np.dot(features.values.astype(float), weights)
# Show only the first few values rather than the whole vector.
linear_outputs[:5]

In [20]:
# Sanity check: print the (rows, columns) shape of the training feature frame.
print(features.shape)

(360, 6)
