In [17]:
import pandas as pd
import numpy as np


#features extraction and encoding:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import string

#nltk for stopwords and tokenizer:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


#visualization tools:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec

#sparse matrix:
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
from scipy.sparse import hstack, vstack

#regressor validation:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

#regressors to test:
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [18]:
# Load the two tab-separated input files from the working directory:
# dev.tsv -> df_dev, eval.tsv -> df_eval.
df_dev, df_eval = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in ('dev.tsv', 'eval.tsv')
)

### read me:

This code quickly reproduces the submitted 0.859 public score. If the commented-out line in section 1a is uncommented, it reproduces the 0.846 public score instead. The output is written to output_mlp.csv. The solution 2 file contains all the validation steps, the visualization code, and the rest of the material discussed in the report.

#### df_dev cleaning:

In [19]:
#1a
# Keep only the development rows whose 'quality' is strictly positive
# (rows with a missing quality compare False and are dropped as well).
has_positive_quality = df_dev['quality'].gt(0)
df_dev = df_dev.loc[has_positive_quality]
#[uncomment line below for 0.846 score]# 
# df_dev.drop_duplicates(subset=['description','quality'], inplace=True)

#### attribute preprocessing:

In [20]:
# Stack the development rows on top of the evaluation rows so that both are
# preprocessed identically; ignore_index=True renumbers the rows 0..n-1, with
# the df_dev rows first.
df = pd.concat(
    [df_dev, df_eval],
    ignore_index=True,
    sort=False,
)

designation:

In [21]:
#3
# Text normalization of 'designation': lower-case, NFKD-decompose, then
# round-trip through ASCII with errors='ignore' so that every character that
# cannot be encoded as ASCII is dropped.
df['designation'] = (
    df['designation']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

class LemmaTokenizer:
    """Callable tokenizer: word-tokenizes a document and lemmatizes each token.

    Instances are passed as the `tokenizer` argument of the vectorizer.
    """

    def __init__(self):
        # One WordNet lemmatizer is created up front and reused for every call.
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, document):
        """Return the list of lemmas of `document`, in token order.

        Each token produced by `word_tokenize` is stripped of surrounding
        whitespace before being lemmatized.
        """
        lemmatize = self.lemmatizer.lemmatize
        return [lemmatize(token.strip()) for token in word_tokenize(document)]


lemmaTokenizer = LemmaTokenizer()
# Tokens to discard: nltk stop words, single punctuation characters, and a few
# hand-picked tokens.
list_sw=stopwords.words() + list(string.punctuation) + ['st.',"'s",'wine','vine','']
# use_idf=False + norm=None + binary=True turns the vectorizer into a plain
# 0/1 "token present in designation" encoder.
# norm=None (rather than the previous norm=False) is the documented way to
# switch normalisation off; recent scikit-learn releases validate the
# parameter and reject False.
vectorizer = TfidfVectorizer(tokenizer=lemmaTokenizer,stop_words=list_sw, use_idf=False, norm=None, binary=True)
wpm = vectorizer.fit_transform(df['designation'].fillna(''))

N = 5000   # number of most frequent designation tokens kept as features

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new name and fall back on old releases.
# The vocabulary is fetched once instead of once per use.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = list(vectorizer.get_feature_names())

# Column sums of the binary matrix = number of rows containing each token.
token_counts = np.asarray(wpm.sum(axis=0)).ravel().tolist()
freq = sorted(zip(feature_names, token_counts), key=lambda x: x[1], reverse=True)[:N]
words = [ word for word, _ in freq ]  #we take the top N word

# Set membership keeps the mask construction linear in the vocabulary size
# (testing against the `words` list rescans up to N entries per token).
top_words = set(words)
mask = [ w in top_words for w in feature_names ]
words_ = [ w for w in feature_names if w in top_words ]   # kept tokens, in column order
# NOTE(review): .toarray() materialises a dense (rows x kept tokens) array and
# can be large; kept dense here so downstream cells receive the same type.
desig_words_df=wpm[:, np.array(mask)].toarray()



winery:

In [22]:
#4
# Text normalization of 'winery': lower-case, NFKD-decompose, drop every
# character that cannot be encoded as ASCII.
df['winery'] = (
    df['winery']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

# Number of rows per winery.
winery = df['winery'].value_counts()

# A winery is kept as a category only if it occurs at least N_entries times.
N_entries = 2
winery_mask = winery.values >= N_entries
top_frequent_winery = winery.loc[winery_mask].index

# 'tf_winery' holds the winery name for the frequent wineries and NaN for
# every other row.
df_winery_mask = df['winery'].isin(top_frequent_winery)
df['tf_winery'] = df['winery'].where(df_winery_mask)

geographical information: country, region_1, province:

In [23]:
#5
# Text normalization of the two geographical columns: lower-case,
# NFKD-decompose, drop every character that cannot be encoded as ASCII.
for geo_column in ('province', 'region_1'):
    df[geo_column] = (
        df[geo_column]
        .str.lower()
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )

# Distinct values of each column and the values that occur in both.
province = df['province'].value_counts().index
region_1 = df['region_1'].value_counts().index
common_value = np.intersect1d(province, region_1)

# Where a row's region_1 merely repeats its province, blank region_1 out.
same_as_province = df['province'].eq(df['region_1'])
df.loc[same_as_province, 'region_1'] = np.nan

variety:

In [24]:
#6
# Text normalization of 'variety': lower-case, NFKD-decompose, drop every
# character that cannot be encoded as ASCII.
df['variety'] = (
    df['variety']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

# Number of rows per variety.
variety = df['variety'].value_counts()

# A variety is kept as a category only if it occurs at least N_entries times.
N_entries = 7
variety_mask = variety.values >= N_entries
top_frequent_variety = variety.loc[variety_mask].index

# 'tf_variety' holds the variety name for the frequent varieties and NaN for
# every other row.
df_variety_mask = df['variety'].isin(top_frequent_variety)
df['tf_variety'] = df['variety'].where(df_variety_mask)

#### final encoding:

In [25]:
# Set the target aside before it is dropped from the feature frame.
df_quality = df['quality']

# Remove the target and the raw columns that are not one-hot encoded below.
# NOTE: this mutates df in place, so the cell cannot be re-run without first
# re-running the cells that build df.
columns_to_drop = ['country','winery','variety','description','designation','quality','region_2']
df.drop(columns=columns_to_drop, inplace=True)

# The first `tresh` rows of df are the df_dev rows, the rest are df_eval rows.
tresh = len(df_dev)

# One-hot encode the remaining categorical attributes, append the designation
# token columns on the right, and convert to CSR for row slicing.
df_1h = hstack([pd.get_dummies(df, sparse=True), desig_words_df]).tocsr()

# Split the encoded matrix back into development and evaluation parts.
X_dev = df_1h[:tresh]
y_dev = df_quality.iloc[:tresh]

X_eval = df_1h[tresh:]

#### final model:

In [None]:
# Hidden-layer widths of the multi-layer perceptron, first hidden layer first.
hidden_layers = (64, 128, 128, 128, 256, 256, 256, 516, 516, 516, 1024)

reg = MLPRegressor(
    hidden_layer_sizes=hidden_layers,
    random_state=42,        # fixed seed so the run is repeatable
    verbose=True,
    early_stopping=True,
)

# Train on the development rows, then predict the evaluation rows.
reg.fit(X_dev, y_dev)
y_pred = reg.predict(X_eval)

# Write the predictions as a two-column CSV: the row index as "Id" and the
# predicted value as "Predicted".
predictions = pd.DataFrame(y_pred)
predictions.to_csv("output_mlp.csv", header=["Predicted"], index_label="Id")