In [1]:
import pandas as pd

# Remote text file that holds the raw table used throughout the notebook.
data_url = "http://www-stat.wharton.upenn.edu/~waterman/DataSets/uva.txt"

# read_table is read_csv with a tab separator; spell the separator out.
df = pd.read_csv(data_url, sep="\t")

# Preview the first five rows.
df.head()

Unnamed: 0,who,Newbie,Age,Gender,Household Income,Sexual Preference,Country,Education Attainment,Major Occupation,Marital Status,Years on Internet
0,id74364,0,54.0,Male,$50-74,Gay male,Ontario,Some College,Computer,Other,4-6 yr
1,id84505,0,39.0,Female,Over $100,Heterosexual,Sweden,Professional,Other,Other,1-3 yr
2,id84509,1,49.0,Female,$40-49,Heterosexual,Washington,Some College,Management,Other,Under 6 mo
3,id87028,1,22.0,Female,$40-49,Heterosexual,Florida,Some College,Computer,Married,6-12 mo
4,id76087,0,20.0,Male,$30-39,Bisexual,New Jersey,Some College,Education,Single,1-3 yr


In [2]:
# Remove the row identifier ("who") and two columns that are not used as
# model inputs further down.
# Rebinding through drop(..., errors="ignore") instead of df.pop() keeps this
# cell re-runnable: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["who", "Country", "Years on Internet"], errors="ignore")

# Show the remaining columns and their dtypes.
df.dtypes

Newbie                    int64
Age                     float64
Gender                   object
Household Income         object
Sexual Preference        object
Education Attainment     object
Major Occupation         object
Marital Status           object
dtype: object

In [3]:
# Columns that are converted to the pandas "category" dtype.
category_cols = [
    "Gender",
    "Household Income",
    "Sexual Preference",
    "Education Attainment",
    "Major Occupation",
    "Marital Status",
]

# A single astype call with a per-column mapping converts all of them at once.
df = df.astype({name: "category" for name in category_cols})

df.dtypes

Newbie                     int64
Age                      float64
Gender                  category
Household Income        category
Sexual Preference       category
Education Attainment    category
Major Occupation        category
Marital Status          category
dtype: object

In [17]:
# One-hot encode the categorical columns; numeric columns pass through as-is.
# The dtype is pinned to uint8 so the indicator columns stay numeric 0/1 on
# pandas >= 2.0 as well, where the default indicator dtype became bool (which
# would turn the mixed-dtype `.values` further down into an object array).
df_onehot = pd.get_dummies(df, dtype="uint8")

# Preview only the first rows instead of rendering the whole frame.
df_onehot.head()

Unnamed: 0,Newbie,Age,Gender_Female,Gender_Male,Household Income_$10-19,Household Income_$20-29,Household Income_$30-39,Household Income_$40-49,Household Income_$50-74,Household Income_$75-99,...,Major Occupation_Education,Major Occupation_Management,Major Occupation_Other,Major Occupation_Professional,Marital Status_Divorced,Marital Status_Married,Marital Status_Other,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,0,54.0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0,39.0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,1,49.0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
3,1,22.0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,20.0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19578,0,22.0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
19579,0,19.0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
19580,0,49.0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
19581,1,42.0,1,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0


In [5]:
# Count the missing values in each column of the encoded frame.
df_onehot.isna().sum()

Newbie                                 0
Age                                  561
Gender_Female                          0
Gender_Male                            0
Household Income_$10-19                0
Household Income_$20-29                0
Household Income_$30-39                0
Household Income_$40-49                0
Household Income_$50-74                0
Household Income_$75-99                0
Household Income_Over $100             0
Household Income_Under $10             0
Sexual Preference_Bisexual             0
Sexual Preference_Gay male             0
Sexual Preference_Heterosexual         0
Sexual Preference_Lesbian              0
Sexual Preference_Transgender          0
Sexual Preference_na                   0
Education Attainment_College           0
Education Attainment_Doctoral          0
Education Attainment_Grammar           0
Education Attainment_High School       0
Education Attainment_Masters           0
Education Attainment_Other             0
Education Attain

In [6]:
# Fill the missing ages with the column mean (mean() skips NaN entries).
# NOTE(review): the mean is taken over all rows, before the train/test split
# further down, so held-out rows contribute to the imputed value.
age_mean = df_onehot["Age"].mean()
df_onehot["Age"] = df_onehot["Age"].fillna(age_mean)

In [18]:
# Select the target by name rather than by position, so a change in column
# order cannot silently swap the label for a feature.
target_col = "Newbie"

# Fail early if any missing value remains, e.g. when df_onehot was rebuilt
# and the imputation cell was not run again afterwards.
assert not df_onehot.isna().any().any(), "df_onehot still contains NaN values"

# Feature matrix: every column except the target.
x_data = df_onehot.drop(columns=target_col).values
# Target as a column vector of shape (n_rows, 1).
y_data = df_onehot[target_col].values.reshape(-1, 1)
y_data.shape, x_data.shape

((19583, 1), (19583, 37))

In [19]:
from sklearn import preprocessing  # provides MinMaxScaler (min-max scaling)

# Rescale every feature column with min-max scaling.
# NOTE(review): the scaler is fitted on the full matrix, before the
# train/test split further down, so its per-column min/max also reflect
# held-out rows.
min_max_scaler = preprocessing.MinMaxScaler().fit(x_data)
x_data = min_max_scaler.transform(x_data)
x_data

array([[0.65333333, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.45333333, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.58666667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.58666667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.49333333, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25333333, 1.        , 0.        , ..., 0.        , 1.        ,
        0.        ]])

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.33,
    random_state=42,
)

# Sizes of the two feature partitions.
X_train.shape, X_test.shape

((13120, 37), (6463, 37))

In [10]:
from sklearn.linear_model import LogisticRegression

# Name the solver explicitly instead of relying on the library default, which
# changed from liblinear to lbfgs in scikit-learn 0.22. The recorded estimator
# repr in this notebook shows solver='warn', i.e. it presumably ran on a
# release where the unset solver resolved to liblinear and emitted a
# FutureWarning; pinning it keeps results comparable across versions.
logreg = LogisticRegression(fit_intercept=True, solver="liblinear")

# y_train is a column vector of shape (n, 1); fit() expects 1-D labels.
logreg.fit(X_train, y_train.ravel())

In [11]:
# NOTE(review): this expression builds an estimator and discards it; nothing
# is assigned, so `logreg` is unaffected. It looks like the printed repr of a
# fitted model that ended up in an input cell - confirm and delete.
# n_jobs=-1 here does not match the constructor call in the training cell,
# which passes only fit_intercept.
LogisticRegression(C=1.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1,
                   l1_ratio=None, max_iter=100, multi_class='warn',
                   n_jobs=-1, penalty='l2', random_state=None,
                   solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Spot-check: predicted class labels for the first five held-out rows.
logreg.predict(X_test[:5])

array([0, 0, 0, 0, 0], dtype=int64)

In [13]:
# Class probabilities for the same five rows: one row per sample, one column
# per class (column order follows logreg.classes_).
logreg.predict_proba(X_test[:5])

array([[0.56843258, 0.43156742],
       [0.91112572, 0.08887428],
       [0.79481085, 0.20518915],
       [0.85841562, 0.14158438],
       [0.62764603, 0.37235397]])

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict the whole held-out set, then tabulate actual labels (rows) against
# predicted labels (columns).
y_pred = logreg.predict(X_test)
y_true = y_test.copy()
confusion_matrix(y_true, y_pred)

array([[4487,  275],
       [1350,  351]], dtype=int64)

In [15]:
# Fraction of held-out rows whose predicted label matches the true label.
# NOTE(review): the recorded confusion matrix suggests the classes are
# imbalanced, so compare this figure with the majority-class rate before
# reading it as model quality.
accuracy_score(y_true, y_pred)

0.7485687761101656