In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, ElasticNet
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer

In [2]:
# Load the training set and the Kaggle test set, using the `id` column as index.
ds = pd.read_csv('data/train.csv', index_col='id')
kaggle = pd.read_csv('data/test.csv', index_col='id')

In [3]:
# Show column dtypes and non-null counts of the training frame.
ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33940 entries, 20000 to 53939
Data columns (total 10 columns):
carat      33940 non-null float64
cut        33940 non-null object
color      33940 non-null object
clarity    33940 non-null object
x          33940 non-null float64
y          33940 non-null float64
z          33940 non-null float64
depth      33940 non-null float64
table      33940 non-null float64
price      33940 non-null int64
dtypes: float64(6), int64(1), object(3)
memory usage: 2.8+ MB


In [4]:
# Preview the first rows of the training frame.
ds.head()

Unnamed: 0_level_0,carat,cut,color,clarity,x,y,z,depth,table,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20000,0.35,Very Good,G,VVS2,4.44,4.48,2.8,62.8,58.0,798
20001,0.7,Ideal,F,SI1,5.66,5.69,3.55,62.6,56.0,2089
20002,0.32,Ideal,F,VVS1,4.42,4.38,2.7,61.4,56.0,990
20003,0.3,Ideal,H,VVS2,4.32,4.35,2.67,61.7,54.2,631
20004,0.33,Premium,I,VVS2,4.41,4.47,2.76,62.2,59.0,579


In [5]:
# Summary statistics of the numeric columns (the `min` row is where zero-valued
# x / y / z measurements show up).
ds.describe()

Unnamed: 0,carat,x,y,z,depth,table,price
count,33940.0,33940.0,33940.0,33940.0,33940.0,33940.0,33940.0
mean,0.796249,5.727926,5.730563,3.535916,61.746491,57.467664,3920.022864
std,0.472866,1.119282,1.120279,0.693763,1.42557,2.237116,3980.229999
min,0.2,0.0,0.0,0.0,43.0,44.0,326.0
25%,0.4,4.71,4.72,2.91,61.0,56.0,952.0
50%,0.7,5.7,5.71,3.52,61.8,57.0,2395.0
75%,1.04,6.54,6.53,4.03,62.5,59.0,5294.0
max,5.01,10.74,31.8,6.98,79.0,95.0,18823.0


In [6]:
# Count missing values per column.
ds.isna().sum()

carat      0
cut        0
color      0
clarity    0
x          0
y          0
z          0
depth      0
table      0
price      0
dtype: int64

In [7]:
# Drop every row where any of x, y or z equals 0.
# .copy() makes `ds` an independent frame: columns are assigned on it further
# down (ds['cut'] = ...), and doing that on a frame derived by boolean indexing
# is what pandas reports as SettingWithCopyWarning (hidden here by the global
# warnings filter).
ds = ds[(ds[['x','y','z']] != 0).all(axis = 1)].copy()

In [8]:
# Sanity check: print how many zeros remain in each dimension column and in depth.
for col in ['x', 'y', 'z', 'depth']:
    print((ds[col] == 0).sum())

0
0
0
0


In [9]:
# Re-check the row count and dtypes after dropping the zero-dimension rows.
ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33929 entries, 20000 to 53939
Data columns (total 10 columns):
carat      33929 non-null float64
cut        33929 non-null object
color      33929 non-null object
clarity    33929 non-null object
x          33929 non-null float64
y          33929 non-null float64
z          33929 non-null float64
depth      33929 non-null float64
table      33929 non-null float64
price      33929 non-null int64
dtypes: float64(6), int64(1), object(3)
memory usage: 2.8+ MB


In [10]:
# Replace the text columns with integer codes.
# Each column gets its own encoder, fitted on the training data only and then
# reused (transform, not fit_transform) on the Kaggle set, so a given category
# maps to the same integer in both frames. Re-fitting on the Kaggle frame would
# produce different codes whenever its set of categories differs from training.
# transform() raises ValueError if the Kaggle set contains a category that was
# never seen in training, instead of silently mis-encoding it.
encoders = {}
for col in ['cut', 'color', 'clarity']:
    lb = LabelEncoder()
    ds[col] = lb.fit_transform(ds[col])
    kaggle[col] = lb.transform(kaggle[col])
    encoders[col] = lb  # kept so codes can be mapped back with inverse_transform

# ds =  pd.get_dummies(ds)
# kaggle =  pd.get_dummies(kaggle)

# ds['volume'] = ds['x']*ds['y']*ds['z']
# kaggle['volume'] = kaggle['x']*kaggle['y']*kaggle['z']

In [11]:
# cols = ds.columns
# is_numeric = pd.api.types.is_numeric_dtype

# # plot each numeric column against price
# for col in cols:
#     if(is_numeric(ds[col]) and col != 'price'):
#         ds.plot.scatter(x = col, y = 'price')

In [12]:
# ds = ds.drop(['x','y','z'], axis=1)
# kaggle = kaggle.drop(['x','y','z'], axis=1)

# ds = ds.drop(['volume','cut'], axis=1)
# kaggle = kaggle.drop(['volume','cut'], axis=1)

# Preview the frame after encoding: cut, color and clarity are now integers.
ds.head()

Unnamed: 0_level_0,carat,cut,color,clarity,x,y,z,depth,table,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20000,0.35,4,3,7,4.44,4.48,2.8,62.8,58.0,798
20001,0.7,2,2,2,5.66,5.69,3.55,62.6,56.0,2089
20002,0.32,2,2,6,4.42,4.38,2.7,61.4,56.0,990
20003,0.3,2,4,7,4.32,4.35,2.67,61.7,54.2,631
20004,0.33,3,5,7,4.41,4.47,2.76,62.2,59.0,579


In [13]:
# Separate the target (price) from the feature columns.
y = ds['price']
x = ds.drop(columns=['price'])

# Hold out 30% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=7
)

# X_train = ds.drop(['price'], axis=1)
# y_train = ds['price']

In [14]:
def rmspe_score(y_test, y_pred):
    """Return the root mean squared percentage error (RMSPE).

    Computes sqrt(mean(((y_test - y_pred) / y_test) ** 2)) along axis 0.

    Both inputs are converted to float NumPy arrays first, so values are
    compared position by position. Without that, two pandas Series with
    different indexes would be aligned by label and yield NaN or a silently
    wrong result, and plain Python lists would raise TypeError.

    Parameters
    ----------
    y_test : array-like
        True target values. A zero entry makes the corresponding percentage
        error infinite or NaN.
    y_pred : array-like
        Predicted values, same shape as ``y_test``.

    Returns
    -------
    numpy.float64 for 1-D input; one value per column for 2-D input.
    """
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    return np.sqrt(np.mean(np.square((actual - predicted) / actual), axis = 0))

In [15]:
# Baseline model: ordinary least squares fitted on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# 5-fold cross-validation on the training split, using the estimator's default scorer.
accuracies = cross_val_score(lr, X_train, y_train, cv=5, verbose=1)
y_pred = lr.predict(X_test)

print('')
print('####### Linear Regression #######')
print('Score : %.4f' % lr.score(X_test, y_test))
print(accuracies)

# Hold-out metrics on the test split.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
rmspe = rmspe_score(y_test, y_pred)

print('')
for template, value in [
    ('MAE    : %0.2f ', mae),
    ('RMSE   : %0.2f ', rmse),
    ('R2     : %0.4f ', r2),
    ('RMSPE  : %0.4f ', rmspe),
]:
    print(template % value)

In [16]:
# Random forest fitted on the training split (fixed seed for reproducibility).
rf = RandomForestRegressor(n_estimators = 1000, max_features = 9, random_state = 7)
rf.fit(X_train , y_train)
# Cross-validate the random forest itself. Passing `lr` here would report the
# linear model's scores under the Random Forest heading.
# Note: this fits 5 additional forests, so the cell takes noticeably longer.
accuracies = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = rf.predict(X_test)

print('')
print('####### Random Forest Regression #######')
print('Score : %.4f' % rf.score(X_test, y_test))
print(accuracies)

# Hold-out metrics on the test split.
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)**0.5
r2 = r2_score(y_test, y_pred)
rmspe = rmspe_score(y_test, y_pred)

print('')
print('MAE    : %0.2f ' % mae)
print('RMSE   : %0.2f ' % rmse)
print('R2     : %0.4f ' % r2)
print('RMSPE  : %0.4f ' % rmspe)

In [17]:
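## searching for the best random forest parameters
# param_grid = [
# {'n_estimators': [100], 'max_features': [8, 9]}
# ]

# rf = RandomForestRegressor(random_state = 7)

# # RMSPE is an error (lower is better): without greater_is_better = False
# # the grid search would pick the parameters with the HIGHEST error.
# gs = GridSearchCV(rf, param_grid, cv = 5, scoring = make_scorer(rmspe_score, greater_is_better = False))
# gs.fit(X_train, y_train)
# gs.best_params_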

In [18]:
# et = ExtraTreesRegressor(n_estimators = 100)
# et.fit(X_train , y_train)
# accuracies = cross_val_score(estimator = et, X = X_train, y = y_train, cv = 5,verbose = 1)
# y_pred = et.predict(X_test)

# print('')
# print('####### Extra Trees Regression #######')
# print('Score : %.4f' % et.score(X_test, y_test))
# print(accuracies)

# mae = mean_absolute_error(y_test, y_pred)
# rmse = mean_squared_error(y_test, y_pred)**0.5
# r2 = r2_score(y_test, y_pred)
# rmspe = rmspe_score(y_test, y_pred)

# print('')
# print('MAE    : %0.2f ' % mae)
# print('RMSE   : %0.2f ' % rmse)
# print('R2     : %0.4f ' % r2)
# print('RMSPE  : %0.4f ' % rmspe)

In [19]:
# Fit the final random forest and write the Kaggle submission file.
# NOTE(review): the model is trained on the 70% training split only; fitting on
# the full cleaned data (x, y) may be preferable for the final submission - confirm.
rf = RandomForestRegressor(n_estimators = 1000, max_features = 9, random_state = 7)
rf.fit(X_train , y_train)

# Select the Kaggle columns in exactly the training feature order. A missing
# column raises KeyError here; a different column order in test.csv is corrected
# rather than passed to predict() as-is.
kaggle_features = kaggle[X_train.columns]
y_kaggle = rf.predict(kaggle_features)

submission = pd.DataFrame({'id':kaggle.index, 'price':y_kaggle})
submission.to_csv('submission.csv', index = False)