In [74]:
# https://www.gia.edu/diamond-quality-factor
import pandas as pd
from sklearn import datasets, svm, metrics
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [75]:
diamonds =  pd.read_csv('./diamonds-datamad1019/data.csv/data.csv')

In [76]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,2.26,Ideal,G,SI2,61.9,57.0,8.44,8.36,5.2,12831
1,2.43,Very Good,H,SI2,63.2,57.0,8.56,8.5,5.39,16170
2,0.8,Premium,F,SI2,61.0,57.0,6.03,6.01,3.67,2797
3,0.4,Ideal,F,I1,63.3,60.0,4.68,4.64,2.95,630
4,0.31,Ideal,G,VS2,61.6,55.0,4.39,4.37,2.7,698


In [77]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 10 columns):
carat      40455 non-null float64
cut        40455 non-null object
color      40455 non-null object
clarity    40455 non-null object
depth      40455 non-null float64
table      40455 non-null float64
x          40455 non-null float64
y          40455 non-null float64
z          40455 non-null float64
price      40455 non-null int64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.1+ MB


In [78]:
diamonds = pd.get_dummies(diamonds,columns=["cut"],prefix="",prefix_sep="")

In [79]:
diamonds.head()

Unnamed: 0,carat,color,clarity,depth,table,x,y,z,price,Fair,Good,Ideal,Premium,Very Good
0,2.26,G,SI2,61.9,57.0,8.44,8.36,5.2,12831,0,0,1,0,0
1,2.43,H,SI2,63.2,57.0,8.56,8.5,5.39,16170,0,0,0,0,1
2,0.8,F,SI2,61.0,57.0,6.03,6.01,3.67,2797,0,0,0,1,0
3,0.4,F,I1,63.3,60.0,4.68,4.64,2.95,630,0,0,1,0,0
4,0.31,G,VS2,61.6,55.0,4.39,4.37,2.7,698,0,0,1,0,0


In [80]:
diamonds = pd.get_dummies(diamonds,columns=["color"])

In [81]:
diamonds.head()

Unnamed: 0,carat,clarity,depth,table,x,y,z,price,Fair,Good,Ideal,Premium,Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,2.26,SI2,61.9,57.0,8.44,8.36,5.2,12831,0,0,1,0,0,0,0,0,1,0,0,0
1,2.43,SI2,63.2,57.0,8.56,8.5,5.39,16170,0,0,0,0,1,0,0,0,0,1,0,0
2,0.8,SI2,61.0,57.0,6.03,6.01,3.67,2797,0,0,0,1,0,0,0,1,0,0,0,0
3,0.4,I1,63.3,60.0,4.68,4.64,2.95,630,0,0,1,0,0,0,0,1,0,0,0,0
4,0.31,VS2,61.6,55.0,4.39,4.37,2.7,698,0,0,1,0,0,0,0,0,1,0,0,0


In [82]:
diamonds = pd.get_dummies(diamonds,columns=["clarity"])

In [83]:
diamonds.head()

Unnamed: 0,carat,depth,table,x,y,z,price,Fair,Good,Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,2.26,61.9,57.0,8.44,8.36,5.2,12831,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,2.43,63.2,57.0,8.56,8.5,5.39,16170,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.8,61.0,57.0,6.03,6.01,3.67,2797,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0.4,63.3,60.0,4.68,4.64,2.95,630,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,0.31,61.6,55.0,4.39,4.37,2.7,698,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [84]:
X = diamonds[["carat","Fair","Good","Ideal","Premium","Very Good","color_D","color_E","color_F","color_G","color_H","color_I","color_J","clarity_I1","clarity_IF","clarity_SI1","clarity_SI2","clarity_VS1","clarity_VS2","clarity_VVS1","clarity_VVS2"]]

In [85]:
y = diamonds.price

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.01)

In [87]:
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    #"svm": LinearSVC(),
    #"logistic": LogisticRegression(solver='lbfgs', max_iter=2000),
    "forest": RandomForestClassifier()
}

for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X_train, y_train)

Training model: KNN
Training model: forest




In [89]:
d = {modelName:model.predict(X_test) for modelName, model in models.items()}

df = pd.DataFrame(d)
y_test.reset_index(inplace=True, drop=True)
df["gt"] = y_test
df

Unnamed: 0,KNN,forest,gt
0,8941,9158,10020
1,11560,11560,11190
2,895,895,835
3,2111,2111,2274
4,2590,3409,2476
...,...,...,...
400,537,689,537
401,983,983,1095
402,7092,7673,7276
403,3780,3950,3733


In [90]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from IPython.core.display import display, HTML

switchColor = lambda value: f"<b style=\"color:orange\">{round(value,2)}</b>" if value < 0.8 else f"<b style=\"color:green\">{round(value,2)}</b>"
printBonito = lambda label,val: display(HTML(f"<span style=\"padding-left:20px\">The {label} is: {switchColor(val)}</span>"))

for modelName, model in models.items():    
    print(f"Evaluating model [{modelName}]:")
    printBonito("Accuracy", accuracy_score(df["gt"],df[modelName]))
    printBonito("Precision", precision_score(df["gt"],df[modelName],average='weighted'))
    printBonito("Recall", recall_score(df["gt"],df[modelName],average='weighted'))

Evaluating model [KNN]:


  'precision', 'predicted', average, warn_for)


  'recall', 'true', average, warn_for)


Evaluating model [forest]:


In [91]:
diamonds_test =  pd.read_csv('./diamonds-datamad1019/test.csv')

In [92]:
diamonds_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.56,Ideal,I,VS2,62.1,54.0,5.3,5.33,3.3
1,1,1.05,Ideal,G,VS2,61.9,56.0,6.56,6.52,4.05
2,2,0.5,Premium,E,VS2,61.5,56.0,5.11,5.07,3.13
3,3,0.8,Ideal,F,VS1,62.8,56.0,5.91,5.96,3.73
4,4,0.54,Ideal,G,VS1,61.4,57.0,5.23,5.2,3.2


In [93]:
diamonds_test = pd.get_dummies(diamonds_test,columns=["cut"],prefix="",prefix_sep="")

In [94]:
diamonds_test = pd.get_dummies(diamonds_test,columns=["color"])

In [95]:
diamonds_test = pd.get_dummies(diamonds_test,columns=["clarity"])

In [96]:
X = diamonds_test[["carat","Fair","Good","Ideal","Premium","Very Good","color_D","color_E","color_F","color_G","color_H","color_I","color_J","clarity_I1","clarity_IF","clarity_SI1","clarity_SI2","clarity_VS1","clarity_VS2","clarity_VVS1","clarity_VVS2"]]

In [97]:
d = {modelName:model.predict(X) for modelName, model in models.items()}
print(d)

df = pd.DataFrame(d)


{'KNN': array([1216, 6604, 1746, ..., 8529,  506, 1243]), 'forest': array([ 1417,  6604,  1746, ..., 11230,   506,  1273])}


In [98]:
df.drop(columns=['KNN'], inplace=True)
df.rename(columns={"forest": "price"}, inplace=True)
df

Unnamed: 0,price
0,1417
1,6604
2,1746
3,3884
4,1847
...,...
13480,2331
13481,2129
13482,11230
13483,506


In [99]:
df.to_csv('./diamonds-datamad1019/submission.csv')

## Segunda prueba