In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Plot / Graph stuffs
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import sklearn.metrics as metrics #import confusion_matrix, classification_report

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show the contents of the raw-data directory so the available input files are visible.
raw_listing = check_output(["ls", "../data_root/raw"])
print(raw_listing.decode("utf8"))

In [None]:
import pickle
import pandas as pd
import sklearn.metrics as metrics

# Load the previously trained model from disk.
# NOTE(review): unpickling can execute arbitrary code contained in the file,
# so this path must only ever point at a trusted artifact.
# The context manager guarantees the file handle is closed after loading.
with open('../data_root/model/model.pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

in_data = '../data_root/processed/test.parquet.gzip'

test = pd.read_parquet(in_data)

# Every column except the last is used as a feature; the last column is the target.
X_test = test.iloc[:,:-1]

y_test = test.iloc[:,-1]

predictions = loaded_model.predict(X_test)

# Regression metrics comparing the held-out targets with the model's predictions.
print("Results of random regressor on price and description length only:" )
print("explained_variance_score: ", metrics.explained_variance_score(y_test, predictions)) #Explained variance regression score function
print("max_error: ", metrics.max_error(y_test, predictions)) #max_error metric calculates the maximum residual error.
print("mean absolute error: ", metrics.mean_absolute_error(y_test, predictions)) #Mean absolute error regression loss
print("mean squared error: ", metrics.mean_squared_error(y_test, predictions)) #Mean squared error regression loss
print("mean squared log error: ", metrics.mean_squared_log_error(y_test, predictions)) #Mean squared logarithmic error regression loss
print("median absolute error: ", metrics.median_absolute_error(y_test, predictions)) #Median absolute error regression loss
print("R2 score: ", metrics.r2_score(y_test, predictions)) #R^2 (coefficient of determination) regression score function.


In [None]:
# Read the raw wine reviews; the first CSV column becomes the row index.
input_csv = '../data_root/raw/wine_dataset.csv'
init_data = pd.read_csv(input_csv, index_col=0)

# Text overview: row count, numeric summary and per-column missing-value counts.
print("Number of rows before processing:", len(init_data))
for heading, report in (
    ("Summary of numerical columns: ", init_data.describe()),
    ("Summary of missing data: ", init_data.isna().sum()),
):
    print()
    print(heading)
    print(report)
print()
print("Sample data: ")
# Last expression of the cell, so the first rows are rendered as a table.
init_data.head()


In [None]:
# Columns retained for the analysis.
selected_columns = ['country', 'description', 'points', 'price',
                    'province', 'title', 'variety', 'winery']
selected_data = init_data[selected_columns]

# Keep the first occurrence of each fully identical row.
deduped_data = selected_data.drop_duplicates()
print("Number of rows after removing duplicates:", len(deduped_data))

# Discard every row that has a missing value in any retained column.
data = deduped_data.dropna()
print("Number of rows after removing missing data:", len(data))

In [None]:
# Bar chart of how many wines received each score.
fig, ax = plt.subplots(figsize=(30,10))

# Draw first: the pandas bar plot installs its own ticks and tick labels (and
# may set the x-axis label from the index name), so any styling applied before
# this call can be lost.
data.groupby(['points']).count()['description'].plot(ax=ax, kind='bar')

ax.set_title('Number of wines per points', fontweight="bold", size=25) # Title
ax.set_ylabel('Number of wines', fontsize = 25) # Y label
ax.set_xlabel('Points', fontsize = 25) # X label
# tick_params stores the label size on the axis, so it applies to every tick.
ax.tick_params(axis='both', labelsize=20) # X and Y ticks
plt.show()

In [None]:
# Display the summary statistics that DataFrame.describe() reports for `data`.
data.describe()

In [None]:
# Add the length of each description (via len) as a new column.
description_lengths = data['description'].map(len)
data = data.assign(description_length=description_lengths)

# Box plot of description length for every score.
fig, ax = plt.subplots(figsize=(30, 10))
sns.boxplot(x='points', y='description_length', data=data, ax=ax)
ax.set_title('Description Length per Points', fontweight="bold", size=25)
ax.set_xlabel('Points', fontsize=25)
ax.set_ylabel('Description Length', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [None]:
# Map a raw score onto one of four coarse buckets.
def transform_points_simplified(points):
    """Return the bucket number for a wine score.

    Buckets: below 86 -> 1, 86 up to (not including) 88 -> 2,
    88 up to (not including) 91 -> 3, 91 and above -> 4.
    A value that satisfies none of the comparisons (for example NaN) yields 0.
    """
    if points < 86:
        return 1
    if points < 88:
        return 2
    if points < 91:
        return 3
    if points >= 91:
        return 4
    # Reached only when every comparison above is False.
    return 0

# Bucket every score and store the result in a new "points_simplified" column.
simplified_points = data['points'].map(transform_points_simplified)
data = data.assign(points_simplified=simplified_points)

In [None]:
# Bar chart of how many wines fall into each simplified points bucket.
fig, ax = plt.subplots(figsize=(30,10))

# Draw first: the pandas bar plot installs its own ticks and tick labels (and
# may set the x-axis label from the index name), so any styling applied before
# this call can be lost.
data.groupby(['points_simplified']).count()['description'].plot(ax=ax, kind='bar')

ax.set_title('Number of wines per points', fontweight="bold", size=25) # Title
ax.set_ylabel('Number of wines', fontsize = 25) # Y label
ax.set_xlabel('Points', fontsize = 25) # X label
# tick_params stores the label size on the axis, so it applies to every tick.
ax.tick_params(axis='both', labelsize=20) # X and Y ticks
plt.show()

In [None]:
# Box plot of description length for every simplified points bucket.
fig, ax = plt.subplots(figsize=(30, 10))
sns.boxplot(x='points_simplified', y='description_length', data=data, ax=ax)
ax.set_title('Description Length per Points', fontweight="bold", size=25)
ax.set_xlabel('Points', fontsize=25)
ax.set_ylabel('Description Length', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [None]:
# Categorical columns (one-hot encoded before modelling).
cat_features = ['country', 'province', 'variety', 'winery']

# Numeric columns used as-is.
num_features = ['price', 'description_length']

# Target column, kept as a list so selecting it yields a DataFrame.
labels = ['points']


In [None]:
# Numeric-only baseline: predict the target from the numeric feature columns.
X_num = data[num_features]
y = data[labels]


# Training model
# 90/10 split with a fixed seed so the same rows are held out on every run.
X_num_train, X_num_test, y_num_train, y_num_test = train_test_split(
    X_num, y, test_size=0.1, random_state=101
)

# Seed the forest too; without it the reported metrics change on every run.
rfr_num = RandomForestRegressor(random_state=101)
# `y` is a one-column DataFrame; flatten it to the 1-D target that fit()
# expects, which avoids scikit-learn's column-vector DataConversionWarning.
rfr_num.fit(X_num_train, np.ravel(y_num_train))

# Testing model
predictions_num = rfr_num.predict(X_num_test)

In [None]:
#' % Linear Regression model with Python
#' % Matti Pastell
#' % 19.4.2013

#' #Requirements
#' This is an example of doing linear regression analysis using Python
#' and [statsmodels](http://statsmodels.sourceforge.net). We'll use the new formula API
#' which makes fitting the models very familiar for R users.
#' You'll also need [Numpy](http://www.numpy.org/), [Pandas](http://pandas.pydata.org/)
#' and [matplotlib](http://matplotlib.org/).

#' The analysis can be published using Pweave 0.22 and later.

#' Import libraries

import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import statsmodels
import matplotlib.pyplot as plt
# Explicit names instead of `import *`, so it is clear where the two plotting
# helpers come from and no unrelated names are pulled into the notebook.
from statsmodels.graphics.regressionplots import influence_plot, plot_leverage_resid2

#' The statsmodels API seems to change often, check the release version:
#+ term=True

# A bare expression in the middle of a cell is not displayed, so print it.
print(statsmodels.__version__)


#' We'll use the [whiteside](http://stat.ethz.ch/R-manual/R-patched/library/MASS/html/whiteside.html) dataset from R package MASS. You can read the description of the dataset from the link, but in short it contains:

#' >*The weekly gas consumption and average external temperature at a house in south-east England for two
#' heating seasons, one of 26 weeks before, and one of 30 weeks after cavity-wall insulation was installed.*

#' Load dataset using Pandas:

url = 'https://raw.githubusercontent.com/mpastell/Rdatasets/master/csv/MASS/whiteside.csv'
whiteside = pd.read_csv(url)

#' # Fitting the model
#' Let's see what the relationship between gas consumption and temperature is before the insulation.
#' See [statsmodels documentation](http://statsmodels.sourceforge.net/devel/example_formulas.html)
#' for more information about the syntax.

model = sm.ols(formula='Gas ~ Temp', data=whiteside, subset = whiteside['Insul']=="Before")
fitted = model.fit()
print(fitted.summary())

#' # Plot the data and fit

# Each plot gets its own figure: inside a single notebook cell, consecutive
# pyplot calls would otherwise all draw onto the same axes and overlap.
Before = whiteside[whiteside["Insul"] == "Before"]
fig_fit, ax_fit = plt.subplots()
ax_fit.plot(Before["Temp"], Before["Gas"], 'ro')
ax_fit.plot(Before["Temp"], fitted.fittedvalues, 'b')
ax_fit.legend(['Data', 'Fitted model'])
ax_fit.set_ylim(0, 10)
ax_fit.set_xlim(-2, 12)
ax_fit.set_xlabel('Temperature')
ax_fit.set_ylabel('Gas')
ax_fit.set_title('Before Insulation')

#' # Fit diagnostics
#' Statsmodels [OLSresults](http://statsmodels.sourceforge.net/devel/generated/statsmodels.regression.linear_model.OLSResults.html) objects contain the usual diagnostic information about the model and you can use the `get_influence()` method to get more diagnostic information (such as Cook's distance).

#' ## A look at the residuals
#' Histogram of normalized residuals

fig_resid, ax_resid = plt.subplots()
ax_resid.hist(fitted.resid_pearson)
ax_resid.set_ylabel('Count')
ax_resid.set_xlabel('Normalized residuals')


#' ## Cook's distance

#' [OLSInfluence](http://statsmodels.sourceforge.net/devel/generated/statsmodels.stats.outliers_influence.OLSInfluence.html)
#'  objects contain more diagnostic information

influence = fitted.get_influence()
#c is the distance and p is p-value
(c, p) = influence.cooks_distance
fig_cook, ax_cook = plt.subplots()
ax_cook.stem(np.arange(len(c)), c, markerfmt=",")


#' # Statsmodels builtin plots

#' Statsmodels includes some builtin functions for plotting residuals against leverage:

plot_leverage_resid2(fitted)
influence_plot(fitted)
# Render all figures once; without this the figure returned by the last call
# may also be echoed as the cell's output value.
plt.show()

In [None]:
# Regression metrics for the numeric-only model, printed in a fixed order.
print("Results of random regressor on price and description length only:" )
numeric_model_reports = (
    ("explained_variance_score: ", metrics.explained_variance_score),  # explained variance
    ("max_error: ", metrics.max_error),  # maximum residual error
    ("mean absolute error: ", metrics.mean_absolute_error),  # mean absolute error loss
    ("mean squared error: ", metrics.mean_squared_error),  # mean squared error loss
    ("mean squared log error: ", metrics.mean_squared_log_error),  # mean squared logarithmic error loss
    ("median absolute error: ", metrics.median_absolute_error),  # median absolute error loss
    ("R2 score: ", metrics.r2_score),  # coefficient of determination
)
for caption, score_fn in numeric_model_reports:
    print(caption, score_fn(y_num_test, predictions_num))


In [None]:
# Categorical-only model: predict the target from one-hot encoded categories.
X_cat = data[cat_features]

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')

enc.fit(X_cat)
# Keep the encoder's sparse output. Converting the full one-hot matrix to a
# dense array before dropping rare categories allocates rows x categories
# floats, which can exhaust memory for high-cardinality columns.
X_cat_tfm = enc.transform(X_cat)
# Number of rows in which each one-hot column is set.
category_counts = np.asarray(X_cat_tfm.sum(axis=0)).ravel()
# Retain only the categories that occur in more than 10 rows.
X_cat_tfm = X_cat_tfm[:, np.flatnonzero(category_counts > 10)]

# Training model
# 90/10 split with a fixed seed so the same rows are held out on every run.
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(X_cat_tfm, y, test_size=0.1, random_state=101)
# Seed the forest too; without it the reported metrics change on every run.
rfr_cat = RandomForestRegressor(random_state=101)
# `y` is a one-column DataFrame; flatten it to the 1-D target that fit()
# expects, which avoids scikit-learn's column-vector DataConversionWarning.
rfr_cat.fit(X_cat_train, np.ravel(y_cat_train))

# Testing model
predictions_cat = rfr_cat.predict(X_cat_test)

In [None]:
# Regression metrics for the categorical-only model, printed in a fixed order.
print("Results of random regressor on", ", ".join(cat_features), "only:" )
categorical_model_reports = (
    ("explained_variance_score: ", metrics.explained_variance_score),  # explained variance
    ("max_error: ", metrics.max_error),  # maximum residual error
    ("mean absolute error: ", metrics.mean_absolute_error),  # mean absolute error loss
    ("mean squared error: ", metrics.mean_squared_error),  # mean squared error loss
    ("mean squared log error: ", metrics.mean_squared_log_error),  # mean squared logarithmic error loss
    ("median absolute error: ", metrics.median_absolute_error),  # median absolute error loss
    ("R2 score: ", metrics.r2_score),  # coefficient of determination
)
for caption, score_fn in categorical_model_reports:
    print(caption, score_fn(y_cat_test, predictions_cat))


In [None]:
from scipy.sparse import csr_matrix, hstack

# Combined model: one-hot encoded categories plus the numeric columns.
# scipy.sparse.hstack is meant for sparse blocks; handing it a dense array and
# a DataFrame directly can fail while it assembles the block grid, so both
# parts are converted to CSR explicitly (this works whether X_cat_tfm is dense
# or already sparse).
X_all = hstack([csr_matrix(X_cat_tfm), csr_matrix(X_num.values)], format='csr')

# Training model
# 90/10 split with a fixed seed so the same rows are held out on every run.
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(X_all, y, test_size=0.1, random_state=101)
# Seed the forest too; without it the reported metrics change on every run.
rfr_all = RandomForestRegressor(random_state=101)
# `y` is a one-column DataFrame; flatten it to the 1-D target that fit()
# expects, which avoids scikit-learn's column-vector DataConversionWarning.
rfr_all.fit(X_all_train, np.ravel(y_all_train))

# Testing model
predictions_all = rfr_all.predict(X_all_test)

In [None]:
# Regression metrics for the combined (categorical + numeric) model.
# Every metric compares y_all_test with predictions_all; the mean squared
# error line previously scored the categorical model's predictions by mistake.
print("Results of random regressor on all features except description:" )
print("explained_variance_score: ", metrics.explained_variance_score(y_all_test, predictions_all)) #Explained variance regression score function
print("max_error: ", metrics.max_error(y_all_test, predictions_all)) #max_error metric calculates the maximum residual error.
print("mean absolute error: ", metrics.mean_absolute_error(y_all_test, predictions_all)) #Mean absolute error regression loss
print("mean squared error: ", metrics.mean_squared_error(y_all_test, predictions_all)) #Mean squared error regression loss
print("mean squared log error: ", metrics.mean_squared_log_error(y_all_test, predictions_all)) #Mean squared logarithmic error regression loss
print("median absolute error: ", metrics.median_absolute_error(y_all_test, predictions_all)) #Median absolute error regression loss
print("R2 score: ", metrics.r2_score(y_all_test, predictions_all)) #R^2 (coefficient of determination) regression score function.

In [None]:
#X = data['description']
#y = data['points']

#vectorizer = CountVectorizer()
#vectorizer.fit(X)

In [None]:
#X = vectorizer.transform(X)
#print('Shape of Sparse Matrix: ', X.shape)
#print('Amount of Non-Zero occurrences: ', X.nnz)
# Percentage of non-zero values
#density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))
#print('Density: {}'.format((density)))

In [None]:
# Disabled experiment: random forest on the vectorized descriptions.
# All lines are commented out, including the continuation lines of the
# constructor call, so the cell runs without a syntax error.

# Training the model
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)
#rfc = RandomForestRegressor(n_estimators=100,
#                            min_samples_leaf=2,
#                            max_features="sqrt")
#rfc.fit(X_train, y_train)

# Testing the model
#predictions = rfc.predict(X_test)

In [None]:
#print("explained_variance_score: ", metrics.explained_variance_score(y_test, predictions)) #Explained variance regression score function
#print("max_error: ", metrics.max_error(y_test, predictions)) #max_error metric calculates the maximum residual error.
#print("mean absolute error: ", metrics.mean_absolute_error(y_test, predictions)) #Mean absolute error regression loss
#print("mean squared error: ", metrics.mean_squared_error(y_test, predictions)) #Mean squared error regression loss
#print("mean squared log error: ", metrics.mean_squared_log_error(y_test, predictions)) #Mean squared logarithmic error regression loss
#print("median absolute error: ", metrics.median_absolute_error(y_test, predictions)) #Median absolute error regression loss
#print("R2 score: ", metrics.r2_score(y_test, predictions)) #R^2 (coefficient of determination) regression score function.


In [None]:
#X = data['description']
#y = data['points_simplified']

# Vectorizing model
#vectorizer = TfidfVectorizer()
#vectorizer.fit(X)
#X = vectorizer.transform(X)

In [None]:
# Training model
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)
#rfc = RandomForestClassifier()
#rfc.fit(X_train, y_train)

# Testing model
#predictions = rfc.predict(X_test)
#print(metrics.classification_report(y_test, predictions))