In [34]:
import pandas as pd
from math import sqrt
from sklearn import datasets, svm, metrics
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler , StandardScaler, Imputer, LabelEncoder


In [35]:
diamonds =  pd.read_csv('./diamonds-datamad1019/data.csv/data.csv')

In [36]:
color = {
    "D": 7,
    "E": 6,
    "F": 5,
    "G": 4,
    "H": 3,
    "I": 2,
    "J": 1
}

cut = {
    "Ideal": 5,
    "Premium": 4,
    "Very Good": 3,
    "Good": 2,
    "Fair":1
}

clarity = {
    "IF": 8,
    "VVS1": 7,
    "VVS2": 6,
    "VS1": 5,
    "VS2": 4,
    "SI1": 3,
    "SI2": 2,
    "I1": 1
}

In [37]:
for i in range(len(diamonds)):
    co = color[diamonds.loc[i, "color"]]
    cu = cut[diamonds.loc[i, "cut"]]
    cl = clarity[diamonds.loc[i, "clarity"]]
    diamonds.loc[i, "color_num"] = co
    diamonds.loc[i, "cut_num"] = cu
    diamonds.loc[i, "clarity_num"] = cl

In [33]:
diamonds

Unnamed: 0,carat,depth,table,price,color_num,cut_num,clarity_num,volume
0,2.26,61.9,57.0,12831,4.0,5.0,2.0,366.903680
1,2.43,63.2,57.0,16170,3.0,3.0,2.0,392.176400
2,0.80,61.0,57.0,2797,5.0,4.0,2.0,133.001901
3,0.40,63.3,60.0,630,5.0,5.0,1.0,64.059840
4,0.31,61.6,55.0,698,4.0,5.0,4.0,51.797610
...,...,...,...,...,...,...,...,...
40450,1.11,62.8,61.0,5315,3.0,4.0,3.0,180.060192
40451,0.73,62.6,56.0,2762,5.0,5.0,4.0,119.231280
40452,1.26,59.2,60.0,6855,2.0,3.0,5.0,208.046124
40453,0.72,61.4,56.0,2297,4.0,5.0,2.0,119.547648


In [112]:
label_cut = LabelEncoder()
label_color = LabelEncoder()
label_clarity = LabelEncoder()


diamonds['cut'] = label_cut.fit_transform(diamonds['cut'])
diamonds['color'] = label_color.fit_transform(diamonds['color'])
diamonds['clarity'] = label_clarity.fit_transform(diamonds['clarity'])

In [113]:
#diamonds = diamonds[(diamonds[['x','y','z']] != 0).all(axis=1)]

In [6]:
diamonds.drop(columns=['cut','color','clarity'], inplace=True)

In [7]:
diamonds['volume'] = diamonds['x']*diamonds['y']*diamonds['z']
diamonds.drop(['x','y','z'],axis=1,inplace=True)

In [8]:
diamonds.head()

Unnamed: 0,carat,depth,table,price,color_num,cut_num,clarity_num,volume
0,2.26,61.9,57.0,12831,4.0,5.0,2.0,366.90368
1,2.43,63.2,57.0,16170,3.0,3.0,2.0,392.1764
2,0.8,61.0,57.0,2797,5.0,4.0,2.0,133.001901
3,0.4,63.3,60.0,630,5.0,5.0,1.0,64.05984
4,0.31,61.6,55.0,698,4.0,5.0,4.0,51.79761


In [17]:
X = diamonds[["carat","color_num","cut_num","clarity_num"]]
#X = diamonds[["carat","color","cut","clarity"]]

In [18]:
y = diamonds.price

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [24]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [28]:
models = {
    "KNN": KNeighborsRegressor(n_neighbors=5),
    #"svm": LinearSVC(),
    #"logistic": LogisticRegression(solver='lbfgs', max_iter=2000),
    "forest": RandomForestRegressor()
}

for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X_train, y_train)

Training model: KNN
Training model: forest




In [31]:
d = {modelName:model.predict(X_test) for modelName, model in models.items()}

df = pd.DataFrame(d)
y_test.reset_index(inplace=True, drop=True)
df["gt"] = y_test
df.forest = df.forest.astype(int)
df.KNN = df.KNN.astype(int)
df

Unnamed: 0,KNN,forest,gt
0,571,573,614
1,8678,8228,8373
2,1078,984,923
3,848,768,942
4,4475,4563,4089
...,...,...,...
8086,508,537,470
8087,871,912,719
8088,1337,1303,1413
8089,499,492,530


In [32]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from IPython.core.display import display, HTML

switchColor = lambda value: f"<b style=\"color:orange\">{round(value,2)}</b>" if value < 0.8 else f"<b style=\"color:green\">{round(value,2)}</b>"
printBonito = lambda label,val: display(HTML(f"<span style=\"padding-left:20px\">The {label} is: {switchColor(val)}</span>"))

for modelName, model in models.items():    
    print(f"Evaluating model [{modelName}]:")
    printBonito("Accuracy", accuracy_score(df["gt"],df[modelName]))
    printBonito("Precision", precision_score(df["gt"],df[modelName],average='weighted'))
    printBonito("Recall", recall_score(df["gt"],df[modelName],average='weighted'))

Evaluating model [KNN]:


  'precision', 'predicted', average, warn_for)


  'recall', 'true', average, warn_for)


Evaluating model [forest]:


In [151]:
diamonds_test =  pd.read_csv('./diamonds-datamad1019/test.csv')

In [132]:
label_cut = LabelEncoder()
label_color = LabelEncoder()
label_clarity = LabelEncoder()


diamonds_test['cut'] = label_cut.fit_transform(diamonds_test['cut'])
diamonds_test['color'] = label_color.fit_transform(diamonds_test['color'])
diamonds_test['clarity'] = label_clarity.fit_transform(diamonds_test['clarity'])

In [152]:
for i in range(len(diamonds_test)):
    co = color[diamonds_test.loc[i, "color"]]
    cu = cut[diamonds_test.loc[i, "cut"]]
    cl = clarity[diamonds_test.loc[i, "clarity"]]
    diamonds_test.loc[i, "color_num"] = co
    diamonds_test.loc[i, "cut_num"] = cu
    diamonds_test.loc[i, "clarity_num"] = cl

In [154]:
diamonds_test.drop(columns=['cut','color','clarity'], inplace=True)

In [155]:
#diamonds_test = diamonds_test[(diamonds_test[['x','y','z']] != 0).all(axis=1)]

In [156]:
X = diamonds_test[["carat","color_num","cut_num","clarity_num"]]

In [157]:
d = {modelName:model.predict(X) for modelName, model in models.items()}
print(d)

df = pd.DataFrame(d)


{'forest': array([15813, 17590, 17590, ..., 17590,  8979, 16469])}


In [158]:
#df.drop(columns=['KNN'], inplace=True)
df.rename(columns={"forest": "price"}, inplace=True)
df

Unnamed: 0,price
0,15813
1,17590
2,17590
3,17590
4,15813
...,...
13480,17590
13481,15813
13482,17590
13483,8979


In [159]:
df.to_csv('./diamonds-datamad1019/submission.csv')