In [12]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-ticks')
%matplotlib inline

Lets pick up where we left.

In [13]:
medical = pd.read_pickle("data/medical.p")

In [14]:
# The file includes socio-demographic data, including 
# health insurance and various aspects of health care
# touchpoints for the respondent group of a survey
# conducted in the USA.

# The collection includes 35072 observations and 27 variables:
  
# UMARSTAT – Marital status recode
# UCUREMP – Currently has employer coverage
# UCURNINS – Currently uninsured
# USATMED – Satisfied with quality of medical care
# URELATE – Number of relatives in household
# REGION – region
# STATE - state
# HHID – Household identification number
# FHOSP – In hospital overnight last year
# FDENT – Dental visits last year
# FEMER – Number of emergency room visits last year
# FDOCT – Number of doctor visits last year
# UIMMSTAT – Immigration status
# U_USBORN – U.S.- or foreign-born
# UAGE – Age topcoded
# U_FTPT – Full-time or part-time worker this year
# U_WKSLY – Weeks worked last year
# U_HRSLY – Hours worked per week last year
# U_USHRS – Hours worked per week this year
# HEARNVAL – Earnings amount last year - Household
# HOTHVAL – Household income, total exc. earnings
# HRETVAL – Retirement amount – Household
# HSSVAL – Social Security amount - Household
# HWSVAL – Wages and salaries amount – Household
# UBRACE – race
# GENDER – gender
# UEDUC3 – education level
# CEYES - color of eyes
# CHAIR - color of hair

In [15]:
medical.head()

Unnamed: 0,UMARSTAT,UCUREMP,UCURNINS,USATMED,URELATE,REGION,STATE,HHID,FHOSP,FDENT,...,HEARNVAL,HOTHVAL,HRETVAL,HSSVAL,HWSVAL,UBRACE,GENDER,UEDUC3,CEYES,CHAIR
0,Never married,No,Yes,4,2,Midwest,WI,55616128,No,0,...,0,0,0,0,0,White,Female,0,hazel,brown
1,Separated,Yes,No,4,2,Midwest,WI,54704000,No,2,...,31468,5950,0,0,31468,White,Female,1,blue,black
2,Married_live together,No,No,4,5,Midwest,WI,57874272,No,0,...,24700,11340,0,4920,24700,White,Male,0,brown,brown
3,Divorced,No,Yes,1,4,Midwest,WI,54106816,No,0,...,60000,39002,0,0,60000,Black,Female,0,brown,black
4,Never married,Yes,No,4,0,Midwest,WI,54569152,No,2,...,55280,4200,0,0,55280,Black,Male,1,brown,black


First we will recode UCURNINS to binary form.

In [16]:
print(medical.UCURNINS.unique())
medical["UCURNINS"] = (medical.UCURNINS=="Yes").astype(int)
print(medical.UCURNINS.unique())

['Yes' 'No']
[1 0]


Today we will work with sklearn a lot. Therefore we need to recode all variables with nominal levels to binary form. We will use pandas get_dummies method for that. Lets just get rid of HHID

In [17]:
levCols = []
numCols = []
for col in medical.columns:
    if medical[col].dtype==object:
        levCols.append(col)
    else:
        numCols.append(col)
levCols.remove("HHID")

In [18]:
dummLev = pd.get_dummies(medical[levCols])
dummLev.shape

(35072, 88)

In [19]:
medical = pd.concat([medical[numCols], dummLev], axis=1)

In [20]:
features = medical.columns.tolist()
features.remove("UCURNINS")

In [21]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [22]:
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
## priors as average values if they are not defined
est = lda.fit(medical[features], medical["UCURNINS"])
preds = est.predict(medical[features])
probs = est.predict_proba(medical[features])
sum(preds == medical["UCURNINS"])/len(preds)



0.8959568886861314

In [23]:
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True, priors=(0.5, 0.5))
est = lda.fit(medical[features], medical["UCURNINS"])
preds1 = est.predict(medical[features])
probs1 = est.predict_proba(medical[features])
sum(preds1 == medical["UCURNINS"])/len(preds)
## there are defined priors --> accuracy slightly lower



0.8908245894160584

In [24]:
# ldaPreds = pd.DataFrame([preds, preds1]).transpose()
pd.crosstab(preds, preds1)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26179,284
1,0,8609


In [25]:
lda =  QuadraticDiscriminantAnalysis(store_covariances=True)
est = lda.fit(medical[features], medical["UCURNINS"])
preds2 = est.predict(medical[features])
probs2 = est.predict_proba(medical[features])
sum(preds2 == medical["UCURNINS"])/len(preds1)



0.8907105383211679

In [30]:
lda =  QuadraticDiscriminantAnalysis(store_covariances=True, priors=(0.5, 0.5))
est = lda.fit(medical[features], medical["UCURNINS"])
preds3 = est.predict(medical[features])
probs3 = est.predict_proba(medical[features])
sum(preds3 == medical["UCURNINS"])/len(preds1)



0.8907105383211679

In [39]:
pd.crosstab(preds, preds3)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26174,289
1,1,8608


In [40]:
features = medical.columns.tolist()
features.remove("USATMED")

In [41]:
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
est = lda.fit(medical[features], medical["USATMED"])
preds = est.predict(medical[features])
probs = est.predict_proba(medical[features])
sum(preds == medical["USATMED"])/len(preds)



0.5165374087591241

In [42]:
lda =  QuadraticDiscriminantAnalysis(store_covariances=True)
est = lda.fit(medical[features], medical["USATMED"])
preds1 = est.predict(medical[features])
probs1 = est.predict_proba(medical[features])
sum(preds2 == medical["USATMED"])/len(preds1)



0.039262089416058396

In [43]:
medical["USATMED"].nunique()

5

In [44]:
pd.crosstab(preds, preds1)

col_0,0,1,2,3,4
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,87,0,0,0,0
1,6,2,0,0,0
2,683,5,3,8,0
3,3956,95,7,46,0
4,29411,462,6,293,2


In [170]:
#--------------------------------------------------------------------
# Exercises 3.

# Exercise 3.1.

# Titanic passengers data – 1310 observations and 15 variables:

# passenger_id – Unique passenger id
# pclass – Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
# survived – Survival (0 = No, 1 = Yes)
# name – Name and SUrname
# sex – Sex (0 = Male, 1 = Female)
# age – Age in years
# sibsp – # of siblings / spouses aboard the Titanic
# parch – # of parents / children aboard the Titanic
# ticket – Ticket number
# fare – Passenger fare
# cabin – Cabin number
# embarked – Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
# boat – Lifeboat (if survived)
# body – Body number (if did not survive and body was recovered)
# home.dest – Home/Destination

# Use linear and quadratic discriminant analysis to
# explain the probability of survival (survived = 1).
# Generate fitted values and compare them for different
# models.

In [171]:
titanic = pd.read_csv("data/titanic.csv")

In [172]:
titanic['survived'].head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: int64

In [173]:
titanic.head()

Unnamed: 0,passenger_id,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,1,"Allen, Miss. Elisabeth Walton",1,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,2,1,1,"Allison, Master. Hudson Trevor",0,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,3,1,0,"Allison, Miss. Helen Loraine",1,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,4,1,0,"Allison, Mr. Hudson Joshua Creighton",0,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,5,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [175]:
levCols = []
numCols = []
for col in titanic.columns:
    if titanic[col].dtype==object:
        levCols.append(col)
    else:
        numCols.append(col)

levCols.remove("name")
levCols.remove("home.dest")
numCols.remove("passenger_id")

In [176]:
levCols

['ticket', 'cabin', 'embarked', 'boat']

In [177]:
numCols

['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'body']

In [178]:
dummLev = pd.get_dummies(titanic[levCols])
dummLev.shape

(1309, 1145)

In [179]:
titanic = pd.concat([titanic[numCols], dummLev], axis=1)

In [180]:
x = titanic.isna().sum()
x.loc[titanic.isna().sum() > 0]

age      263
fare       1
body    1188
dtype: int64

In [181]:
titanic['body'].fillna(titanic['body'].mean(), inplace=True)

In [182]:
x = titanic.isna().sum()
x.loc[titanic.isna().sum() > 0]

age     263
fare      1
dtype: int64

In [183]:
titanic.dropna()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,body,ticket_110152,ticket_110413,...,boat_6,boat_7,boat_8,boat_8 10,boat_9,boat_A,boat_B,boat_C,boat_C D,boat_D
0,1,1,1,29.0000,0,0,211.3375,160.809917,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0.9167,1,2,151.5500,160.809917,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,2.0000,1,2,151.5500,160.809917,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,30.0000,1,2,151.5500,135.000000,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,25.0000,1,2,151.5500,160.809917,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,1,0,48.0000,0,0,26.5500,160.809917,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,1,1,63.0000,1,0,77.9583,160.809917,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,39.0000,0,0,0.0000,160.809917,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,1,1,53.0000,2,0,51.4792,160.809917,0,0,...,0,0,0,0,0,0,0,0,0,1
9,1,0,0,71.0000,0,0,49.5042,22.000000,0,0,...,0,0,0,0,0,0,0,0,0,0


In [184]:
features = titanic.columns.tolist()
features.remove("survived")

In [185]:
titanic = titanic.sample(n=100)

In [186]:
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
## priors as average values if they are not defined
est = lda.fit(titanic[features], titanic["survived"])
preds = est.predict(titanic[features])
probs = est.predict_proba(titanic[features])
sum(preds == titanic["survived"])/len(preds)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Exercise 3.2.
# Wine Quality Data Set: "data/wines.csv"
# source: https://archive.ics.uci.edu/ml/datasets/wine+quality
# The file contains data on samples of white and red Portuguese wine 
# Vinho Verde. 
# Various physico-chemical characteristics of individual samples
# are available as well as wine quality scores on a point scale (0-10) 
# made by specialists.

# Perform linear and quadratic discriminant analysis 
# to model the quality of wine (variable quality),
# treating the explained variable as qualitative.
# Generate fitted values and compare them for different models.