## Data Analysis with Diamonds

In [1]:
import pandas as pd
import sklearn
from sklearn.linear_model import SGDRegressor
from sklearn import svm

print('done')

done


In [2]:
df = pd.read_csv('diamonds.csv', index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## Convert non numerical values 

In [3]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [4]:
# one way to do this is with cat codes but we need to preserve a pre known higheracry so we will use somthing else
# df['cut'].astype('category').cat.codes

cut_class_dict = {'Fair':1, 'Good':2, 'Very Good':3, 'Premium':4, 'Ideal':5}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

### Now we need to map these values

In [5]:
df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


## Next we shuffle these values 

In [6]:
# always shuffle your data to avoid any biases that may emerge b/c of some order.
df = sklearn.utils.shuffle(df)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
343,0.71,3,6,6,63.8,58.0,2804,5.62,5.66,3.6
40545,0.55,3,2,5,63.6,57.0,1149,5.15,5.19,3.29
50301,0.27,3,5,9,59.7,59.0,544,4.19,4.22,2.51
43181,0.52,5,3,6,62.2,56.4,1391,5.09,5.16,3.19
37801,0.51,5,2,5,61.9,54.0,1000,5.12,5.15,3.18


### We will also drop the price to make sure that it dose not have that data

Recall that many methods will return a dataframe. So for X we want all of the columns EXCEPT for the price one, so we can just drop it. Then we use .values to convert to a numpy array. Then, for our labels, y, we say this is just the price column's values

In [7]:
X = df.drop("price", axis=1).values
y = df["price"].values

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
343,0.71,3,6,6,63.8,58.0,2804,5.62,5.66,3.6
40545,0.55,3,2,5,63.6,57.0,1149,5.15,5.19,3.29
50301,0.27,3,5,9,59.7,59.0,544,4.19,4.22,2.51
43181,0.52,5,3,6,62.2,56.4,1391,5.09,5.16,3.19
37801,0.51,5,2,5,61.9,54.0,1000,5.12,5.15,3.18


## Split values for testing

We want to probably split and save some of these values for testing the model after it's been trained. So we'll do something like:

In [8]:
test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

y_test

array([  621,  2889,  1640,   772, 11190,  4320,  5382, 11206,   716,
         886,  5265,  6908,  4078,   625,  6999,   972,  2822,   886,
         625,  1752,  2947,  6281, 16941,   778, 17492,  1289,   625,
        3536,  4909,  9176, 13095,  1343,  6292,  2088,  9474,  1613,
        5141,  1758,  2853,  1024,  9173,   921,  9530,  3830,  1092,
       12654,  1080,  6098, 12308,  8736, 15575,  2327,  4347,  2078,
       13477,  1712,   683,  9681,  4291, 15105,  1815,  1016,   844,
        3215,  2163,   814,  1813, 14768,  1936,  1436,  6787,   792,
        1132,  3838, 13629,  1675,  1621, 15308,   687,  8752,  2263,
        1415,  9333,  4372,  3231,  3724, 10508,  1662, 10869,  4616,
        5902,   767, 10071, 10161,   755,  4586,  2872,   442,  6353,
       14680,  1089,   816,   561, 12170,  1154,  2142,   709,   475,
        1849, 15802, 15930,   473,  6098,   645,   625,  3593,  2861,
         942,  1960,   990,  3206,  4455,   835,  5055,   872,   855,
        2599,  1015,

## Now we want to train the model

In [9]:
# set up the model 
clf = SGDRegressor(max_iter=1000)
clf.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

### now lets check out our current model score 

In [10]:
print(clf.score(X_test, y_test))

-122555925.37483901


### we then look at the actual prediction values 

In [11]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

12538490.452625751 621
14809061.717327118 2889
-10328077.384397507 1640
34390301.02269459 772
9970008.495135307 11190
33374377.18662119 4320
23241604.113889694 5382
-42134625.6554265 11206
-13726324.440168858 716
24794500.457984924 886


### Results
Well, that's not very good. The score for these regression models is r-squared/coefficient of determination, so I am actually not even sure how we got -70999348.67836547, but apparently we did. R-Squared is more often between 0 and 100%, where 100% is a perfect fit (1.0). Let's try support vector regression instead:

In [13]:
clf = svm.SVR(kernel = 'linear')

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.8214178865581102


In [14]:
for X,y in list(zip(X_test, y_test))[:20]:
    print(f"model predicts {clf.predict([X])[0]}, real value: {y}")

model predicts -57.70413925314824, real value: 621
model predicts 3440.349640774677, real value: 2889
model predicts 1987.0987409995141, real value: 1640
model predicts 670.2613760289405, real value: 772
model predicts 10045.212686385816, real value: 11190
model predicts 3971.869918691127, real value: 4320
model predicts 5685.796701435216, real value: 5382
model predicts 8078.358552975917, real value: 11206
model predicts 658.7389351081783, real value: 716
model predicts 1183.9861732721038, real value: 886
model predicts 6117.2906105654165, real value: 5265
model predicts 5592.126946772254, real value: 6908
model predicts 4800.211617934468, real value: 4078
model predicts 152.05984682459348, real value: 625
model predicts 5705.3228395102, real value: 6999
model predicts 353.89889734903227, real value: 972
model predicts 3257.5260410553583, real value: 2822
model predicts 1000.8761885445292, real value: 886
model predicts -210.93714901554995, real value: 625
model predicts 1911.25502999

## Final Results 
here we can see the SVM model dose a much better job