In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
%matplotlib inline

In [7]:
# Load the credit data set and keep only accounts whose credit limit exceeds 3500.
raw = pd.read_csv('credit.csv', index_col=0)

yes_no = {'No': 0, 'Yes': 1}
# NOTE(review): these integer codes impose an ordering on a nominal variable when
# Ethnicity is used as a regression feature -- consider one-hot columns instead.
ethnicity_codes = {'Caucasian': 0, 'Asian': 1, 'African American': 2}

# Build the cleaned frame in a single chain so that new columns are never assigned
# onto a filtered slice of another frame (which triggers chained-assignment warnings)
# and so that re-running the cell always starts from the raw file contents.
df = (
    raw[raw.Limit > 3500]
    .assign(
        Student=lambda d: d.Student.map(yes_no),
        Married=lambda d: d.Married.map(yes_no),
        Ethnicity=lambda d: d.Ethnicity.map(ethnicity_codes),
        # Explicit 0/1 integer flag; get_dummies returns uint8 or bool depending on
        # the pandas version, so the dtype is fixed here.
        Female=lambda d: d.Gender.eq('Female').astype(int),
    )
    .drop(['Gender'], axis=1)  # Gender is now fully represented by the Female flag
)
df.head(3)

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Student,Married,Ethnicity,Balance,Female
1,14.891,3606,283,2,34,11,0,1,0,333,0
2,106.025,6645,483,3,82,15,1,1,1,903,1
3,104.593,7075,514,4,71,11,0,0,1,580,0


In [8]:
# Response: card balance. Predictors: the three numeric credit features.
y = df['Balance']
X = df[['Income', 'Limit', 'Rating']].astype(float)
# Hold out 33% of the rows for evaluation. The seed is fixed (the same value the
# later splits in this notebook use) so the partition, and any score computed from
# it, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
# fit_intercept=True makes the model estimate the y-intercept (B0) as well as the slopes.
model = linear_model.LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)

In [11]:
# R^2 of the fitted linear model on the held-out test rows.
model.score(X_test, y_test)

0.79540891826399185

In [19]:
# Same response; the predictor set now also contains the 0/1 Student flag.
predictors = ['Income', 'Limit', 'Rating', 'Student']
y = df.Balance
X = df.loc[:, predictors].astype(float)
# Seeded split that holds out 33% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

In [20]:
# Least-squares fit (with intercept) on the training rows.
model = linear_model.LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)

In [21]:
# R^2 on the held-out test rows for the model that includes Student.
model.score(X_test, y_test)

0.99024516911113569

In [23]:
# Redo the split on the same X and y, this time stratified on the Ethnicity codes
# so the train and test portions keep similar Ethnicity proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=df.Ethnicity,
)

In [25]:
# Separate estimator for the stratified split, so `model` is left untouched.
model1 = linear_model.LinearRegression(fit_intercept=True)
model1.fit(X_train, y_train)

In [26]:
# R^2 of the model fitted on the stratified split, evaluated on its test rows.
model1.score(X_test, y_test)

0.98801558809432

In [27]:
# Rebuild the unstratified split: the stratified cell overwrote
# X_train / X_test / y_train / y_test.
y = df.Balance
X = df[['Income', 'Limit', 'Rating', 'Student']].astype(float)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

In [28]:
# Refit the four-predictor model on the restored training rows.
model = linear_model.LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)

In [29]:
# R^2 on the held-out test rows for the refitted four-predictor model.
model.score(X_test, y_test)

0.99024516911113569

In [30]:
# Fitted slopes, in the column order of X: Income, Limit, Rating, Student.
model.coef_

array([ -9.87584036e+00,   2.16173015e-01,   1.59330687e+00,
         4.92253327e+02])

In [31]:
# Fitted intercept term (B0) of the same model.
model.intercept_

-724.69405340113929

In [34]:
from itertools import combinations

In [35]:
# All unordered pairs drawn from the four predictors (4 choose 2 = 6 pairs).
list(combinations(['Rating', 'Income', 'Limit', 'Student'], 2))

[('Rating', 'Income'),
 ('Rating', 'Limit'),
 ('Rating', 'Student'),
 ('Income', 'Limit'),
 ('Income', 'Student'),
 ('Limit', 'Student')]

In [36]:
# Print the subsets of the four predictors, one line per subset size (1 through 4).
four_predictors = ['Rating', 'Income', 'Limit', 'Student']
for size in range(1, 5):
    subsets = list(combinations(four_predictors, size))
    print(subsets)

[('Rating',), ('Income',), ('Limit',), ('Student',)]
[('Rating', 'Income'), ('Rating', 'Limit'), ('Rating', 'Student'), ('Income', 'Limit'), ('Income', 'Student'), ('Limit', 'Student')]
[('Rating', 'Income', 'Limit'), ('Rating', 'Income', 'Student'), ('Rating', 'Limit', 'Student'), ('Income', 'Limit', 'Student')]
[('Rating', 'Income', 'Limit', 'Student')]


In [37]:
# Column names of the cleaned frame (the response, Balance, is among them).
df.columns

Index(['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education', 'Student',
       'Married', 'Ethnicity', 'Balance', 'Female'],
      dtype='object')

In [42]:
# Candidate predictors: every column of the cleaned frame except the response, Balance.
features = ['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education', 'Student',
            'Married', 'Ethnicity', 'Female']

# Print every non-empty subset, one line per subset size. The upper bound is derived
# from the list length so the largest size (all ten predictors at once) is included;
# a hard-coded range(1, 10) stops at subsets of nine.
for size in range(1, len(features) + 1):
    print(list(combinations(features, size)))


[('Income',), ('Limit',), ('Rating',), ('Cards',), ('Age',), ('Education',), ('Student',), ('Married',), ('Ethnicity',), ('Female',)]
[('Income', 'Limit'), ('Income', 'Rating'), ('Income', 'Cards'), ('Income', 'Age'), ('Income', 'Education'), ('Income', 'Student'), ('Income', 'Married'), ('Income', 'Ethnicity'), ('Income', 'Female'), ('Limit', 'Rating'), ('Limit', 'Cards'), ('Limit', 'Age'), ('Limit', 'Education'), ('Limit', 'Student'), ('Limit', 'Married'), ('Limit', 'Ethnicity'), ('Limit', 'Female'), ('Rating', 'Cards'), ('Rating', 'Age'), ('Rating', 'Education'), ('Rating', 'Student'), ('Rating', 'Married'), ('Rating', 'Ethnicity'), ('Rating', 'Female'), ('Cards', 'Age'), ('Cards', 'Education'), ('Cards', 'Student'), ('Cards', 'Married'), ('Cards', 'Ethnicity'), ('Cards', 'Female'), ('Age', 'Education'), ('Age', 'Student'), ('Age', 'Married'), ('Age', 'Ethnicity'), ('Age', 'Female'), ('Education', 'Student'), ('Education', 'Married'), ('Education', 'Ethnicity'), ('Education', 'Femal

In [40]:
# Count the non-empty subsets of the ten candidate predictors, summing over subset sizes 1..10.
candidates = ['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education', 'Student',
              'Married', 'Ethnicity', 'Female']
total = sum(len(list(combinations(candidates, k))) for k in range(1, 11))


In [41]:
# Number of non-empty predictor subsets counted in the previous cell.
total

1023

In [71]:
# Exhaustive subset search: fit a least-squares model on every non-empty subset of
# the ten candidate predictors and record its held-out R^2, coefficients and intercept.
features = ['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education',
            'Student', 'Married', 'Ethnicity', 'Female']

# The response and the train/test partition do not depend on the subset being tried,
# so they are built once. With a fixed random_state the rows selected are the same as
# when re-splitting per subset; each model simply picks its own columns.
y = df.Balance
X = df[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# NOTE(review): subsets are ranked by their score on the test split, so the best score
# is an optimistic estimate -- consider cross-validating on the training rows instead.
rows = []
for i in range(1, len(features) + 1):
    for com in combinations(features, i):
        cols = list(com)  # a tuple would be interpreted as a single column key
        model = linear_model.LinearRegression(fit_intercept=True).fit(X_train[cols], y_train)
        score = model.score(X_test[cols], y_test)
        rows.append({'Score': score, 'Columns': ', '.join(com),
                     'Coef': model.coef_, 'Int': model.intercept_})

# Pin the column order so the table layout does not depend on how the installed
# pandas/Python version orders dict keys.
df1 = pd.DataFrame(rows, columns=['Coef', 'Columns', 'Int', 'Score'])
df1.head()


Unnamed: 0,Coef,Columns,Int,Score
0,[3.10140736773],Income,564.006986,0.086668
1,[0.154033096807],Limit,-171.223522,0.537913
2,[2.29397576729],Rating,-254.87931,0.527959
3,[50.7089108911],Cards,584.29604,-0.023474
4,[-0.434801576922],Age,758.219638,-0.010028


In [79]:
# Let long cell values (coefficient arrays, column lists) show up to 100 characters.
pd.set_option('display.max_colwidth', 100)
# Rank every fitted subset from highest to lowest held-out score.
df1.sort_values(by='Score', ascending=False)

Unnamed: 0,Coef,Columns,Int,Score
882,"[-9.7063583428, 0.320926357688, 25.5767496149, -1.18837484182, 0.217753823979, 500.295362585, -1...","Income, Limit, Cards, Age, Education, Student, Married",-677.117957,0.998961
676,"[-9.70518457332, 0.320909163994, 25.5483060928, -1.18528464719, 500.59506012, -1.44970646279]","Income, Limit, Cards, Age, Student, Married",-674.321308,0.998947
672,"[-9.7082990305, 0.320956127765, 25.5850536411, -1.19047561186, 0.203065667882, 500.384446577]","Income, Limit, Cards, Age, Education, Student",-677.852116,0.998943
989,"[-9.70465624678, 0.32087498393, 25.4675677853, -1.18937032044, 0.24595556815, 499.502745872, -1....","Income, Limit, Cards, Age, Education, Student, Married, Female",-678.824424,0.998938
407,"[-9.70709134608, 0.320938362119, 25.5579563701, -1.18746457062, 500.660072753]","Income, Limit, Cards, Age, Student",-675.192443,0.998932
889,"[-9.70336182283, 0.320856470113, 25.4373496356, -1.18586983937, 499.854008612, -1.84590304594, 4...","Income, Limit, Cards, Age, Student, Married, Female",-675.642836,0.998925
884,"[-9.70717137802, 0.320914683065, 25.4828355887, -1.19196742796, 0.226242321624, 499.649792014, 4...","Income, Limit, Cards, Age, Education, Student, Female",-679.671966,0.998918
967,"[-9.70424173348, 0.313649689062, 0.107727242365, 25.0251595478, -1.18871839639, 0.228262472766, ...","Income, Limit, Rating, Cards, Age, Education, Student, Married",-679.062355,0.998914
678,"[-9.70584348383, 0.320895495313, 25.4541125453, -1.18859673179, 499.966671725, 3.95379118754]","Income, Limit, Cards, Age, Student, Female",-676.687613,0.998908
988,"[-9.70273886738, 0.320898746553, 25.5868336582, -1.18646870113, 0.197474994074, 500.754141029, -...","Income, Limit, Cards, Age, Education, Student, Married, Ethnicity",-676.126949,0.998906


In [80]:
# Row of the best-scoring subset. idxmax() returns an index *label*, so the lookup
# must be label-based (.loc). Positional .iloc only gives the same row while df1
# keeps its default 0..n-1 index, and would silently pick the wrong row after any
# sort, filter or re-index.
df1.loc[df1.Score.idxmax()]

Coef       [-9.7063583428, 0.320926357688, 25.5767496149, -1.18837484182, 0.217753823979, 500.295362585, -1...
Columns                                                 Income, Limit, Cards, Age, Education, Student, Married
Int                                                                                                   -677.118
Score                                                                                                 0.998961
Name: 882, dtype: object