# Descriptive Statistics Review

In this second part of the lab, we are going to continue working with the data that we cleaned in the last part. 
Be sure to continue to write clean code and comment your work well!

First, let's import our libraries and the data we saved. 

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split

import pickle as pk

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## Import file, dropping non-ordinal columns

In [None]:
# Load the cleaned data set and keep only the float64 columns.
diamonds = pd.read_csv('diamonds_data.csv')
diamonds = diamonds.loc[:, diamonds.dtypes == 'float64']

# Feature frame: leave out the raw price AND the regression target
# ('log_price'), so the target can never be selected as a predictor.
# errors='ignore' keeps this working when 'price' was already filtered out.
Diamonds           = diamonds.drop(columns=['price', 'log_price'], errors='ignore')
# Regression target.
diamonds_log_price = diamonds['log_price']

# Do the regression

## Train test splitting

In [None]:
# Hold out a test set (scikit-learn's default split proportions are used).
# A fixed seed makes the split, and every score computed from it,
# reproducible from one run of the notebook to the next.
Diamonds_train, Diamonds_test, diamonds_price_train, diamonds_price_test = \
    train_test_split(
        Diamonds, diamonds_log_price,
        random_state=42
    )

## Model

In [None]:
# Candidate predictors; the first one is part of every generated subset.
columns = ['log_carat', 'depth', 'table', 'clarity_idx', 'color_idx', 'cut_idx', 'x', 'y', 'z', 'pca_x_y_z']


def add_level(lst=None, i=0, l=0):
    """Enumerate every subset of ``columns`` that contains ``columns[0]``.

    Call it without arguments. Each subset is a list of column names in
    the same relative order as in ``columns``; with n columns the result
    has 2 ** (n - 1) entries.

    Parameters
    ----------
    lst : list of list of str, optional
        Accumulator shared by the recursive calls. It is ignored (and
        replaced by a fresh list) when ``l == 0``.
    i : int
        Position in ``lst`` of the subset to extend at this level.
    l : int
        Recursion depth; at depth ``l`` the subset ``lst[i]`` has ``l``
        elements.

    Returns
    -------
    list of list of str
        The accumulator with all generated subsets appended.
    """
    # None instead of a mutable default, so no list object is shared
    # between separate calls.
    if lst is None:
        lst = []

    if l == len(columns):
        return lst

    if l == 0:
        # Seed with the single-column subset, then grow it recursively.
        lst = [[columns[0]]]
        return add_level(lst, i, l + 1)

    first_child = len(lst)
    parent = lst[i]

    # Only append columns located after the parent's last column: this keeps
    # every subset ordered and prevents generating the same subset twice.
    start = max(l, columns.index(parent[-1]) + 1)
    lst.extend(parent + [name] for name in columns[start:])

    # Extend each newly created child in turn. The range is fixed before
    # recursing, so grandchildren are handled by the deeper calls only.
    for child in range(first_child, len(lst)):
        add_level(lst, child, l + 1)

    return lst


# One row per feature subset (labelled by the comma-joined column names);
# the score columns are filled in by the evaluation loop.
tries = \
    pd.DataFrame(
        index = [', '.join(c) for c in add_level()],
        columns = ['MSE', 'R2']
    )

In [None]:
# Fit one linear model per candidate feature subset and score it on the
# held-out split. Scores are computed in price units: both the true and the
# predicted log-prices are mapped back with exp() first.
actual_price = np.exp(diamonds_price_test)

for idx in tries.index:
    # Dedicated name on purpose: rebinding the global `columns` here would
    # change what add_level() enumerates if it were called again.
    feature_columns = idx.split(', ')

    model = LinearRegression()
    model.fit(Diamonds_train[feature_columns], diamonds_price_train)

    data = Diamonds_test[feature_columns]
    price_predicted = model.predict(data)
    predicted_price = np.exp(price_predicted)

    # Assign with a single .loc[row, column] call. The chained form
    # tries.loc[idx]['MSE'] = ... may write to a temporary copy and leave
    # `tries` unchanged.
    # NOTE: the 'MSE' column stores the square root of the MSE (i.e. RMSE).
    tries.loc[idx, 'MSE'] = \
        np.sqrt(
            mean_squared_error(actual_price, predicted_price)
        )

    tries.loc[idx, 'R2'] = r2_score(actual_price, predicted_price)

In [None]:
# Rank all feature subsets by R2, best first, and show the top five.
ranked_by_r2 = tries.sort_values(by='R2', ascending=False)
ranked_by_r2.head(5)

In [None]:
# Rank all feature subsets by the 'MSE' column, lowest error first, and show
# the top five.
ranked_by_error = tries.sort_values(by='MSE', ascending=True)
ranked_by_error.head(5)

# Do the regression based on Clarity

In [None]:
# Predictors used by the per-clarity models below.
# (An earlier variant also included 'depth' and 'table'.)
model_columns = [
    'log_carat',
    'color_idx',
    'cut_idx',
]


In [None]:
# Partition the data by clarity index and hold out a test set inside each
# partition, so that one model per clarity value can be trained and scored.
diamonds_clarity_split = {}

for clarity in diamonds.clarity_idx.unique():
    data   = diamonds[diamonds.clarity_idx == clarity]

    featrs = data.loc[:, model_columns]
    prices = data['log_price']

    # Fixed seed: keeps the per-clarity splits, and the pooled scores
    # derived from them, reproducible across runs.
    Train, Test, train, test = \
        train_test_split(featrs, prices, random_state=42)

    # Capitalised keys hold feature frames, lower-case keys hold targets.
    diamonds_clarity_split[clarity] = \
        {
            'data': data,
            'Train': Train, 'Test': Test,
            'train': train, 'test': test
        }

In [None]:
# Train one regression per clarity value on that partition's training rows,
# then pool the test-set predictions and the matching true values across
# all partitions. Both are converted from log-price back to price.
split_result, split_expected = [], []

for clarity in diamonds.clarity_idx.unique():
    split = diamonds_clarity_split[clarity]

    # 'Train'/'Test' are the feature frames, 'train'/'test' the targets.
    model = LinearRegression()
    model.fit(split['Train'], split['train'])

    price_predicted = model.predict(split['Test'])

    split_result += list(np.exp(price_predicted))
    split_expected += list(np.exp(split['test']))

In [None]:
# Pooled scores of the per-clarity models, in price units.
# scikit-learn metrics take (y_true, y_pred). The order does not matter for
# the squared error, but R2 is not symmetric: the true prices must come
# first, otherwise the score is measured against the predictions' variance.
print(
    np.sqrt(
        mean_squared_error(split_expected, split_result)
    )
)

print(
    r2_score(split_expected, split_result)
)

# Overlay the distributions of predicted and true prices.
sns.distplot(split_result)
sns.distplot(split_expected)

# Apply!

In [None]:
# Load the diamonds to be priced; the cells below add the derived columns
# the model needs and a 'price_predicted' column.
rick = pd.read_csv('rick_diamonds.csv')

## Qualify

In [None]:
# Encode each clarity grade as its position in this list.
# NOTE(review): the order presumably runs from lowest to highest grade and
# must match the encoding used for the training data - confirm.
clarities = [
    'I1', 'SI2', 'SI1', 'VS2',
    'VS1', 'VVS2', 'VVS1', 'IF',
]

rick['clarity_idx'] = list(map(clarities.index, rick.clarity))

In [None]:
# Encode each color grade as its position in this list.
# NOTE(review): the order must match the encoding used for the training
# data - confirm.
colors = [
    'J', 'I', 'H', 'G', 'F', 'E', 'D',
]

rick['color_idx'] = list(map(colors.index, rick.color))

In [None]:
# Encode each cut quality as its position in this list.
# NOTE(review): the order must match the encoding used for the training
# data - confirm.
cuts = [
    'Fair', 'Good', 'Very Good', 'Premium', 'Ideal',
]

rick['cut_idx'] = list(map(cuts.index, rick.cut))

## PCA for x, y, z

In [None]:
pca_data = rick[['x', 'y', 'z']]

# Load the pickled transformer (presumably the PCA fitted on the training
# data in the previous part - confirm). The `with` block closes the file
# handle deterministically instead of leaving it open.
# NOTE: unpickling can execute arbitrary code; only load a file you created
# yourself or otherwise trust.
with open('pca_x_y_z.pkl', 'rb') as pca_file:
    pca_x_y_z = pk.load(pca_file)

rick['pca_x_y_z'] = pca_x_y_z.transform(pca_data)

## Log for carat

In [None]:
# Natural log of the carat weight, stored under the feature name the
# models expect.
rick['log_carat'] = np.log(rick['carat'])

## Run model

In [None]:
# Predict a price for every row of `rick` with a model trained on all
# diamonds sharing its clarity index. Rows whose clarity index never
# occurs in `diamonds` keep NaN.
rick['price_predicted'] = np.nan

for clarity in diamonds.clarity_idx.unique():
    data      = diamonds[diamonds.clarity_idx == clarity]
    rick_data = rick[rick.clarity_idx == clarity]

    # Nothing to price for this clarity value. Skipping also avoids calling
    # predict() on zero rows, which scikit-learn rejects with an error.
    if rick_data.empty:
        continue

    # Train on ALL rows of this clarity value (no hold-out here).
    model = LinearRegression()
    model.fit(data[model_columns], data.log_price)

    # The model predicts log-price; exp() converts back to price.
    rick.loc[rick_data.index, 'price_predicted'] = \
        np.exp(
            model.predict(rick_data[model_columns])
        )

In [None]:
# Plot the distribution of the predicted prices as a visual sanity check.
sns.distplot(rick['price_predicted'])

## Output

In [None]:
# Write the frame, including the derived columns and 'price_predicted',
# to disk without the row index.
rick.to_csv('diamonds_rick_log.csv', index=False)