<h1 align=center style="line-height:200%;font-family:vazir;color:#0099cc">
<font face="vazir" color="#0099cc">
Linear regression model with preprocessing</font>
</h1>

In [1]:
import numpy as np
import pandas as pd 


<center>
<div dir=rtl style="direction: rtl;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=3>
    
|ستون|توضیحات|
|:------:|:---:|
|<code>carat</code>|وزن الماس به معیار قیراط|
|<code>cut</code>|کیفیت تراش‌خوردگی الماس|
|<code>color</code>|رنگ الماس از J(بدترین) تا D (بهترین)|
|<code>clarity</code>|معیار شفافیت الماس (از چپ به راست از بدترین به بهترین: I1, SI2, SI1, VS2, VS1, VVS2, VVS1, IF)|
|<code>x</code>|طول الماس به میلی‌متر|
|<code>y</code>|عرض الماس به میلی‌متر|
|<code>z</code>|عمق الماس به میلی‌متر|
|<code>depth</code>|عمق درصدی که برابر است با z / mean(x,y)|
|<code>table</code>|عرض عریض‌ترین نقطه بالایی الماس|
|<code>price</code>|قیمت الماس|

</font>
</div>
</center>

In [2]:
# Load the labelled training set; the path is relative to the notebook's working directory.
train = pd.read_csv('../data/diamonds_train.csv')
# Preview the first rows to sanity-check the columns and values.
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Load the test set; the path is relative to the notebook's working directory.
test = pd.read_csv('../data/diamonds_test.csv')
# Preview the first rows to sanity-check the columns and values.
test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.73,Ideal,G,VVS2,61.9,55.0,5.83,5.77,3.59
1,0.61,Premium,F,VVS2,59.7,58.0,5.56,5.53,3.31
2,1.55,Premium,I,VS1,58.2,60.0,7.69,7.59,4.45
3,0.46,Good,F,IF,56.2,61.0,5.16,5.24,2.92
4,1.1,Very Good,F,VS2,60.6,58.0,6.67,6.77,4.07


In [4]:
# Do some preprocessing!
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import math

In [5]:
# Separate the regression target from the features.
# The guard keeps this cell idempotent: once `price` has been split off, re-running
# the cell leaves `label` and `train` as they are instead of raising a KeyError.
# A non-mutating drop (rather than inplace=True) rebinds `train` to a new frame.
if 'price' in train.columns:
    label = train['price']
    train = train.drop(columns=['price'])

train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
49995,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
49996,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
49997,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
49998,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [6]:
# Feature groups; each group is routed through its own preprocessing pipeline.
numerical_columns = [
    'carat',
    'depth',
    'table',
    'x',
    'y',
    'z',
]
categorical_columns = [
    'cut',
    'clarity',
    'color',
]

In [7]:
# Inspect the column names left in `train` at this point.
train.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [8]:
# Hold out 25% of the labelled rows for validation; the fixed seed makes the
# split reproducible. Columns are ordered numeric first, then categorical.
data_train, data_valid, target_train, target_valid = train_test_split(
    train[[*numerical_columns, *categorical_columns]],
    label,
    test_size=0.25,
    random_state=42,
)

data_train

Unnamed: 0,carat,depth,table,x,y,z,cut,clarity,color
27434,0.41,62.8,57.0,4.71,4.75,2.97,Very Good,SI1,E
13400,1.01,61.9,59.0,6.38,6.34,3.94,Premium,VS2,G
883,0.86,61.8,55.0,6.12,6.14,3.79,Ideal,SI2,H
7303,1.08,63.2,57.0,6.54,6.50,4.12,Very Good,SI2,G
45124,0.77,61.8,59.0,5.86,5.82,3.61,Premium,SI1,J
...,...,...,...,...,...,...,...,...,...
11284,1.00,61.9,56.0,6.36,6.41,3.95,Ideal,SI1,F
44732,0.52,61.6,54.0,5.17,5.22,3.20,Ideal,VVS2,G
38158,0.40,62.5,60.0,4.72,4.66,2.93,Premium,VS1,D
860,0.73,61.4,56.0,5.79,5.81,3.56,Ideal,SI1,E


In [9]:
# Numeric features: replace NaN with the column mean, then apply StandardScaler.
numeric_preprocessor = Pipeline([
    ('imputation_mean', SimpleImputer(strategy='mean', missing_values=math.nan)),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' means categories not seen during fit do not raise at
# transform time.
categorical_preprocessor = Pipeline([
    ('imputation_constant', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onhot', OneHotEncoder(handle_unknown='ignore')),
])

In [10]:
# Send each column group through its matching pipeline. Columns that belong to
# neither group are dropped (remainder='drop'); n_jobs=-1 uses all available cores.
preprocessor = ColumnTransformer(
    [
        ('num_preprocessor', numeric_preprocessor, numerical_columns),
        ('cat_preprocessor', categorical_preprocessor, categorical_columns),
    ],
    n_jobs=-1,
    remainder='drop',
)
numerical_columns

['carat', 'depth', 'table', 'x', 'y', 'z']

In [11]:
# Wrap the column-wise preprocessor in a pipeline; it is the only step here.
pipe = Pipeline([('preprocessor', preprocessor)])
pipe

In [12]:
# Fit the preprocessing pipeline on the training split only, so that statistics
# from the validation/test rows do not enter the fitted imputers, scaler or encoder.
pipe.fit(data_train, target_train)



In [13]:
# `pipe` was already fitted on `data_train` in the previous cell, so only transform
# here instead of fitting the imputers/scaler/encoder a second time; this also
# matches how the validation and test splits are transformed below.
X_train_processed = pipe.transform(data_train)


In [14]:
# Apply the already-fitted preprocessing to the validation split (transform only, no refit).
X_valid_trans = pipe.transform(data_valid)
# Display the transformed matrix as the cell output.
X_valid_trans

array([[-0.40095617, -0.38803325, -0.11714296, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.22828817,  1.63314748,  1.13820359, ...,  0.        ,
         0.        ,  0.        ],
       [-0.06535919,  0.30892563, -0.65514863, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.96727607,  0.16953385,  0.68986554, ...,  0.        ,
         0.        ,  0.        ],
       [-0.98825089,  0.16953385, -0.20681057, ...,  0.        ,
         0.        ,  0.        ],
       [-0.86240202, -1.08499212, -0.65514863, ...,  0.        ,
         0.        ,  0.        ]])

In [15]:
# Apply the already-fitted preprocessing to the test frame (transform only, no refit).
# The whole frame is passed; the ColumnTransformer is configured with explicit
# column lists and remainder='drop'.
X_Test_trans = pipe.transform(test)
# Display the transformed matrix as the cell output.
X_Test_trans

array([[-0.14925843,  0.09983796, -1.10348668, ...,  0.        ,
         0.        ,  0.        ],
       [-0.40095617, -1.43347156,  0.24152748, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.57067609, -2.47890986,  1.13820359, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.68973402, -1.99103865,  0.24152748, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.48677684,  0.72710095, -0.65514863, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.57067609, -0.73651268, -1.10348668, ...,  0.        ,
         0.        ,  0.        ]])

In [16]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression on the preprocessed training features.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train_processed, target_train)
model

In [17]:
from sklearn.metrics import r2_score

# Score the fitted model on the held-out validation split using R^2.
valid_pred = model.predict(X_valid_trans)
r2_score_valid = r2_score(y_true=target_valid, y_pred=valid_pred)
r2_score_valid

0.9238758541025666

In [18]:
# Predict prices for the test rows and collect them in a single-column frame
# named 'price'.
test_pred = model.predict(X_Test_trans)
submission = pd.DataFrame({'price': test_pred})
submission

Unnamed: 0,price
0,4312.969371
1,3453.941603
2,10343.883424
3,2571.000420
4,7048.257877
...,...
3935,10661.422104
3936,5499.878540
3937,6998.947041
3938,11055.030004
