In [None]:
import os
from pathlib import Path

import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler

In [None]:
# Folder holding the two CSV files. It defaults to the original location, so
# behaviour is unchanged; set the DIAMONDS_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get('DIAMONDS_DATA_DIR', '/home/markaw/Downloads'))

# `diamonds` is the labelled training frame; `predict` holds the rows to score.
diamonds = pd.read_csv(DATA_DIR / 'diamonds_train.csv')
predict = pd.read_csv(DATA_DIR / 'diamonds_test.csv')

In [None]:
# Column roles: the label to predict and the two groups of input columns.
TARGET = 'price'
CAT_FEATURES = [
    'cut',
    'color',
    'clarity',
]
NUM_FEATURES = [
    'carat',
    'depth',
    'table',
    'x',
    'y',
    'z',
]

# Model input columns: categorical group first, numeric group second.
FEATURES = [*CAT_FEATURES, *NUM_FEATURES]

# Disabled experiment: casting the categorical columns to pandas 'category' dtype.
# for i in CAT_FEATURES:
#     diamonds[i] = diamonds[i].astype('category')
#     predict[i] = predict[i].astype('category')

In [None]:
# Numeric branch: fill missing values with the column mean, then scale with
# RobustScaler.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler()),
    ]
)

# Sanity check: fit on the training frame and display the transformed numeric block.
numerical_transformer.fit_transform(diamonds[NUM_FEATURES])


In [None]:
# Categorical branch: replace missing values with the literal label 'missing',
# then map every category to an ordinal code.
#
# OrdinalEncoder accepts only handle_unknown='error' or 'use_encoded_value';
# 'ignore' is a OneHotEncoder option and is rejected here. Categories that were
# not seen during fit are therefore encoded as -1, a code that can never collide
# with the non-negative codes assigned to known categories.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # The step name 'ecoder' is kept unchanged because nested parameter paths
    # ('...__ecoder__<param>') are derived from it.
    ('ecoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Sanity check: fit on the training frame and display the encoded categorical block.
categorical_transformer.fit_transform(diamonds[CAT_FEATURES])


In [None]:
# Send each column group through its own sub-pipeline. The transformers are
# listed numeric first, categorical second.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_preprocessor', numerical_transformer, NUM_FEATURES),
        ('categorical_preprocessor', categorical_transformer, CAT_FEATURES),
    ]
)

# Preview the fully preprocessed training features as a DataFrame.
pd.DataFrame(data=preprocessor.fit_transform(diamonds[FEATURES]))

In [None]:
# Hold-out split, seeded so that re-running the notebook gives the same
# partition (train_test_split shuffles randomly when random_state is unset).
# NOTE(review): `train` / `test` are not referenced by the visible cells below;
# the model is fitted and tuned on the full `diamonds` frame.
train, test = train_test_split(diamonds, random_state=42)

# End-to-end estimator: preprocessing followed by a LightGBM regressor.
# The regressor is seeded as well so repeated fits are reproducible.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LGBMRegressor(n_jobs=-1, random_state=42))])

# Feature matrix and target taken from the complete training frame.
X = diamonds[FEATURES]
y = diamonds[TARGET]

In [None]:
# Baseline fit on the full training data; the trailing ';' hides the estimator repr.
model.fit(X=X, y=y);

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the hyper-parameter search. Each key is a nested
# '<step>__<param>' path into the `model` pipeline.
parameter_grid = dict(
    regressor__max_depth=[2, 4, 6, 8, 11, 16],
    # Powers of two from 64 up to 1024.
    regressor__n_estimators=[2 ** exponent for exponent in range(6, 11)],
    regressor__learning_rate=[0.1, 0.01, 0.001],
    preprocessor__numerical_preprocessor__imputer__strategy=['mean', 'median'],
)

In [None]:
# Randomized hyper-parameter search: 20 sampled candidates, 5-fold CV, scored by
# negative RMSE. random_state pins which candidates are sampled so the search is
# reproducible between runs.
grid_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=parameter_grid,
    n_iter=20,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Run the search on the full training data; the fitted search object is the cell output.
grid_search.fit(X=X, y=y)

In [None]:
# Score the unlabelled rows with the fitted search object.
predicted_prices = grid_search.predict(predict[FEATURES])

# Two-column submission table: row id and predicted price.
submission = pd.DataFrame({'id': predict['id'], 'price': predicted_prices})

# Written with a proper '.csv' extension (the previous name 'submission_csv'
# produced a file without one).
submission.to_csv('submission.csv', index=False)

## Model options

In [None]:
from lightgbm import LGBMClassifier

# Option: LightGBM with default hyper-parameters.
# The target is treated as a regression problem elsewhere in this notebook
# (LGBMRegressor, RMSE scoring), so a regressor is used here too; a classifier
# would treat every distinct target value as its own class.
# The shared `preprocessor` handles imputing, scaling and encoding inside the
# pipeline, so the raw feature frames can be passed in directly.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LGBMRegressor())])

model.fit(X, y)
X_predict = predict[FEATURES]
predictions = model.predict(X_predict)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Option: shallow decision tree (depth capped at 4).
# The tree is wrapped in a pipeline with the shared `preprocessor` because the
# raw feature frame contains string-valued categorical columns that the tree
# cannot consume directly.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', DecisionTreeRegressor(max_depth=4))])

model.fit(X, y)
X_predict = predict[FEATURES]
predictions = model.predict(X_predict)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import datasets, linear_model

# Option: Lasso regression with default regularisation.
# The estimator is wrapped in a pipeline with the shared `preprocessor` so that
# imputing, scaling and encoding are applied to both the training frame and the
# rows to score.
lasso = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', linear_model.Lasso())])

lasso.fit(X, y)
X_predict = predict[FEATURES]
# Predict with `lasso` itself, not with whatever `model` currently points to.
predictions = lasso.predict(X_predict)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn import preprocessing


# Option: random forest with default hyper-parameters.
# The shared `preprocessor` is the first pipeline step so the string-valued
# categorical columns are encoded before they reach the forest; the forest is
# seeded so repeated fits give the same trees.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))])

model.fit(X, y)
X_predict = predict[FEATURES]
predictions = model.predict(X_predict)