In [1]:
import pandas as pd
import numpy as np


#features extraction and encoding:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import string

#nltk for stopwords and tokenizer:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


#visualization tools:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec

#sparse matrix:
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
from scipy.sparse import hstack, vstack

#regressor validation:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

#regressors to test:
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [None]:
# Load the development and evaluation sets; both files are tab-separated.
df_dev, df_eval = (pd.read_csv(tsv_path, sep='\t') for tsv_path in ('dev.tsv', 'eval.tsv'))

### read me:
In order to reproduce the submitted score, the following blocks of code have to be executed:
 - 0.859 score: [1a] [2] [3] [4] [5] [6], [7]'Final encoding' section, [8]'best performer' section under 'final models attempt (and print out):' section
 - 0.859 score: [1a] [1b] [2] [3] [4] [5] [6] 'Final encoding' section, 'best performer' section under 'final models attempt (and print out):' section
 
other sections contains snippet of code discussed in the report or used for the graph. Since most of them did 

#### data visualization (graph used for the report):

In [None]:
# Figure styling used for the report
sns.set(rc={'figure.figsize':(8.5,5)})
sns.set_style("ticks")
sns.set_context("paper", font_scale=1.7)

# Number of missing values per attribute, sorted ascending (computed once)
null_counts = df_dev.isnull().sum().sort_values()
null_value = null_counts.values
null_label = null_counts.index

fig, ax = plt.subplots()
# x/y are passed by keyword: recent seaborn releases no longer accept the
# data vectors as positional arguments.
graph = sns.barplot(x=null_value, y=null_label, palette=["C0", "C1", "C2"], ax=ax)
ax.set_xlim(1,90000)
ax.set_xticks(np.arange(0,90000,10000))
ax.set(xlabel='total number of null values')
ax.set(ylabel='attributes')

plt.grid()
fig1 = plt.gcf()  # keep a handle on the figure so it can still be saved after show()
plt.show()
#fig1.savefig("paper per report/grafici/nullValueCount.pdf",bbox_inches='tight')

In [None]:
# Figure styling used for the report
sns.set(rc={'figure.figsize': (7, 5)})
sns.set_style("ticks")
sns.set_context("paper", font_scale=1.7)

# Two stacked panels sharing the x axis: a thin boxplot above a histogram
gs = gridspec.GridSpec(2, 1, height_ratios=[0.6, 2])

ax0 = plt.subplot(gs[0])
g1 = sns.boxplot(x=df_dev['quality'], width=0.4, ax=ax0)

ax1 = plt.subplot(gs[1], sharex=ax0)
g2 = sns.histplot(x=df_dev['quality'], ax=ax1, bins=50)
g2.set(xlabel='quality values', ylabel='frequency')

plt.subplots_adjust(hspace=.0)  # no vertical gap between the two panels
plt.grid()                      # grid on the current axes
fig1 = plt.gcf()                # keep a handle on the figure so it can be saved after show()
plt.show()

fig1.savefig("paper per report/grafici/qualityCount2.pdf", bbox_inches='tight')

In [None]:
#text normalization sample
# The sample is deliberately NOT called `string`: that name would shadow the
# `string` module imported at the top of the notebook and break every later
# use of `string.punctuation`.
designation_sample = df_dev.loc[92093:92190, 'designation']
# lower-case, decompose accented characters (NFKD) and drop whatever is not ASCII
normal = (
    designation_sample
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)
print(designation_sample.values, normal.values)

description wordcloud:

In [None]:
# Requires `df` (built in step [2]) and the `LemmaTokenizer` class (defined in step [3]):
# run those cells first.
lemmaTokenizer = LemmaTokenizer()
df['description']=df['description'].str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') #text normalization

list_sw=stopwords.words() + list(string.punctuation) + ['st.',"'s",'wine','vine','tannin','flavor','fruit']
# norm=None (not False) is the documented way to disable normalization; with
# use_idf=False and binary=False the matrix then holds raw term counts.
vectorizer = TfidfVectorizer(tokenizer=lemmaTokenizer,stop_words=list_sw, use_idf=False, norm=None, binary=False, max_features=100, ngram_range=(1,3))
wpm = vectorizer.fit_transform(df['description'].fillna(''))
# get_feature_names() was replaced by get_feature_names_out() in newer scikit-learn; support both
feature_names = vectorizer.get_feature_names_out() if hasattr(vectorizer, 'get_feature_names_out') else vectorizer.get_feature_names()
# (term, total count) pairs, most frequent first
freq = sorted(zip(feature_names, wpm.sum(axis=0).tolist()[0]),key=lambda x: x[1], reverse=True)

In [None]:
from wordcloud import WordCloud

# term -> frequency mapping consumed by generate_from_frequencies
d = dict(freq)

Cloud = WordCloud(
    background_color="white",
    max_words=100,
    width=800,
    height=400,
).generate_from_frequencies(d)

plt.figure(figsize=(20, 10))
plt.imshow(Cloud, interpolation="bilinear")
plt.axis("off")
fig1 = plt.gcf()  # keep a handle on the figure so it can be saved after show()
plt.show()
fig1.savefig("paper per report/grafici/worddesc.png", bbox_inches='tight', dpi=300)

### df_dev preprocessing and cleaning:

# [1a]
Entries with quality 0 can be considered noise: the value is probably missing and 0 was used as a default (placeholder) value.

In [4]:
#1a
# keep only the rows whose quality is strictly positive
positive_quality = df_dev['quality'] > 0
df_dev = df_dev[positive_quality]

let's see if there's some similar entries/duplicate values: let's check if we have:
 - duplicated description
 - entries with all attributes value in common (even quality), except the description
 

In [None]:
# every row that shares both description and quality with at least one other row
dup_description_mask = df_dev.duplicated(['description', 'quality'], keep=False)
df_dev[dup_description_mask].sort_values(by='description')

In [None]:
# every row that shares all of the attributes below (description excluded) with another row
dup_key_columns = ['country', 'designation', 'province', 'region_1', 'region_2', 'variety', 'winery', 'quality']
dup_attributes_mask = df_dev.duplicated(dup_key_columns, keep=False)
df_dev[dup_attributes_mask].sort_values(by=dup_key_columns)

# [1b] 
let's drop the duplicates:

In [5]:
#1b
# Rebind the name instead of using inplace=True: after step [1a] df_dev is the
# result of a boolean filter, and mutating such a derived frame in place can
# trigger pandas' SettingWithCopyWarning. The resulting rows are the same.
df_dev = df_dev.drop_duplicates(subset=['description','quality'])

### attribute preprocessing:

# [2]

In [6]:
#2
# Stack the development rows on top of the evaluation rows (fresh 0..n-1 index)
# so that every attribute is encoded consistently on both sets.
frames_to_merge = [df_dev, df_eval]
df = pd.concat(frames_to_merge, ignore_index=True, sort=False)

### Designation:

let's try to 1h encode the most frequent designation (at least N_entries). First, we normalize the text:

In [None]:
#discarded approach
# text normalization: lower-case, decompose accents (NFKD), drop non-ASCII characters
df['designation'] = (
    df['designation']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)
designation = df['designation'].value_counts()

# designations that occur at least N_entries times
N_entries = 4
desig_mask = designation.values >= N_entries
top_frequent_desig = designation.index[desig_mask]

# new column holding the designation only where it is a frequent one (NaN elsewhere)
df_desig_mask = df['designation'].isin(top_frequent_desig)
df['tf_desig'] = df.loc[df_desig_mask, 'designation']

# [3]
let's 1he encode the top frequent word:

In [7]:
#3
#text normalization step: lower-case, decompose accents (NFKD), drop non-ASCII characters
df['designation'] = (
    df['designation']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

class LemmaTokenizer(object):
    """Callable tokenizer: tokenizes a document with ``word_tokenize`` and
    lemmatizes every token with a ``WordNetLemmatizer``."""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, document):
        """Return the list of lemmas of ``document``, one per token, in order.

        Each token is stripped of surrounding whitespace before being lemmatized.
        """
        lemmatize = self.lemmatizer.lemmatize
        return [lemmatize(token.strip()) for token in word_tokenize(document)]


lemmaTokenizer = LemmaTokenizer()
list_sw=stopwords.words() + list(string.punctuation) + ['st.',"'s",'wine','vine','']
# norm=None (not False) is the documented way to disable normalization; with
# use_idf=False and binary=True each cell is 1 if the word occurs in the designation, else 0.
vectorizer = TfidfVectorizer(tokenizer=lemmaTokenizer,stop_words=list_sw, use_idf=False, norm=None, binary=True)
wpm = vectorizer.fit_transform(df['designation'].fillna(''))

N = 5000
# Fetch the vocabulary once; get_feature_names() was replaced by
# get_feature_names_out() in newer scikit-learn, so support both.
feature_names = list(vectorizer.get_feature_names_out() if hasattr(vectorizer, 'get_feature_names_out') else vectorizer.get_feature_names())
# (word, number of designations containing it) pairs, most frequent first, cut to the top N
freq = sorted(zip(feature_names, wpm.sum(axis=0).tolist()[0]),key=lambda x: x[1], reverse=True)[:N]
words = [ word for word, _ in freq ]  #we take the top N word
top_words = set(words)  # set membership keeps the two scans below linear in the vocabulary size
mask = [ w in top_words for w in feature_names ]
words_ = [ w for w in feature_names if w in top_words ]
# dense 0/1 matrix restricted to the columns of the top N words
desig_words_df=wpm[:, np.array(mask)].toarray()



target encoding trial:

In [None]:
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler

# The encoder is fitted on the development set only (the only one with a target),
# then applied to the evaluation set.
encoder = TargetEncoder(cols=['designation'])
dev_desig_te = encoder.fit_transform(df_dev['designation'], df_dev['quality'])
eval_desig_te = encoder.transform(df_eval['designation'])

# stack dev and eval encodings, in this order, with a fresh index
encoded_parts = [dev_desig_te, eval_desig_te]
df_te_desig = pd.concat(encoded_parts, ignore_index=True)


### winery

# [4] 
we 1h encode the winery that has at least N entry:

In [8]:
#4
# text normalization: lower-case, decompose accents (NFKD), drop non-ASCII characters
df['winery'] = (
    df['winery']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

winery = df['winery'].value_counts()

# wineries that occur at least N_entries times
N_entries = 2
winery_mask = winery.values >= N_entries
top_frequent_winery = winery.index[winery_mask]

# new column holding the winery only where it is a frequent one (NaN elsewhere)
df_winery_mask = df['winery'].isin(top_frequent_winery)
df['tf_winery'] = df.loc[df_winery_mask, 'winery']

target encoder trial:

In [None]:
#discarded approach
from category_encoders import TargetEncoder
from sklearn.preprocessing import MinMaxScaler

# The encoder is fitted on the development set only, then applied to the evaluation set.
encoder = TargetEncoder(cols=['winery'])
dev_winery_te = encoder.fit_transform(df_dev['winery'], df_dev['quality'])
eval_winery_te = encoder.transform(df_eval['winery'])

# stack dev and eval encodings, then rescale the stacked values with a min-max scaler
scaler = MinMaxScaler()
encoded_parts = [dev_winery_te, eval_winery_te]
df_te_winery = scaler.fit_transform(pd.concat(encoded_parts, ignore_index=True))

### geographical information: country, region_1, province

# [5]

Let's check for redundant values in the province and region_1 attributes: when a row has the same value in both, we delete the value in region_1, so that the encoder won't encode the same value twice.

In [9]:
#5
# text normalization of both geographical attributes
for geo_col in ('province', 'region_1'):
    df[geo_col] = (
        df[geo_col]
        .str.lower()
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )

province = df['province'].value_counts().index
region_1 = df['region_1'].value_counts().index
# values that appear in both attributes (kept for inspection)
common_value = np.intersect1d(province, region_1)

# where a row repeats its province as region_1, blank the region_1 value
same_place = df['province'] == df['region_1']
df.loc[same_place, 'region_1'] = np.nan

In [None]:
# query to check if all went well: no row should still have province equal to region_1
still_equal = df['province'] == df['region_1']
df[still_equal]

### variety:

# [6] 

In [10]:
#6
# text normalization: lower-case, decompose accents (NFKD), drop non-ASCII characters
df['variety'] = (
    df['variety']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

variety = df['variety'].value_counts()

# varieties that occur at least N_entries times
N_entries = 7
variety_mask = variety.values >= N_entries
top_frequent_variety = variety.index[variety_mask]

# new column holding the variety only where it is a frequent one (NaN elsewhere)
df_variety_mask = df['variety'].isin(top_frequent_variety)
df['tf_variety'] = df.loc[df_variety_mask, 'variety']

tf-idf one hot:

In [None]:
#discarded approach, poor results
lemmaTokenizer = LemmaTokenizer()

df['variety']=df['variety'].str.lower().str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8') #text normalization
# Replace the spaced dash first: done the other way round, the bare '-' replacement
# removes every dash and the ' - ' replacement can never match.
# regex=False: both patterns are literal strings.
df['variety']=df['variety'].str.replace(' - ','  ', regex=False)
df['variety']=df['variety'].str.replace('-','  ', regex=False)

list_sw=stopwords.words() + list(string.punctuation) + ['st.',"'s",'wine','vine','-']
# norm=None (not False) is the documented way to disable normalization
vectorizer = TfidfVectorizer(tokenizer=lemmaTokenizer,stop_words=list_sw, use_idf=False, norm=None, binary=True)
wpm = vectorizer.fit_transform(df['variety'].fillna(''))

# Fetch the vocabulary once; get_feature_names() was replaced by
# get_feature_names_out() in newer scikit-learn, so support both.
feature_names = list(vectorizer.get_feature_names_out() if hasattr(vectorizer, 'get_feature_names_out') else vectorizer.get_feature_names())
#here we don't need N because freq has a low number of features
freq = sorted(zip(feature_names, wpm.sum(axis=0).tolist()[0]),key=lambda x: x[1], reverse=True)
words = [ word for word, _ in freq ]  #we take the top word
top_words = set(words)  # set membership keeps the two scans below linear
mask = [ w in top_words for w in feature_names ]
words_ = [ w for w in feature_names if w in top_words ]
variety_words_df=wpm[:, np.array(mask)].toarray()

### description:

All the approaches tried below were discarded since they led to worse results.

we can try to use the description to predict the null value in the designation columns, in order to train better the regressor:

In [None]:
df_description=df[['description','designation','province','variety','winery']]

# rows whose designation is missing are the ones we want to predict
predict_mask=df['designation'].isna()

# .copy(): both subsets get columns re-assigned in the following cell; without an
# explicit copy those assignments act on a slice of df_description and raise
# pandas' SettingWithCopyWarning.
df_desc_dev=df_description[~predict_mask].copy()
df_desc_eval=df_description[predict_mask].copy()

# index labels (in df) of the rows whose designation will be predicted
index_desig_predict=df_desc_eval.index

In [None]:
#normalization: the same text clean-up (lower-case, NFKD, ASCII only) is applied
#to the four categorical attributes of both subsets
text_columns = ['designation', 'province', 'winery', 'variety']
for subset in (df_desc_dev, df_desc_eval):
    for text_col in text_columns:
        subset[text_col] = (
            subset[text_col]
            .str.lower()
            .str.normalize('NFKD')
            .str.encode('ascii', errors='ignore')
            .str.decode('utf-8')
        )



In [None]:
# keep only the training rows whose designation occurs more than N times
N = 20
topf_desig = df_desc_dev['designation'].value_counts()
is_frequent = topf_desig > N
topN_desig = topf_desig[is_frequent].index
top_f_desig_mask = df_desc_dev['designation'].isin(topN_desig)
df_desc_dev = df_desc_dev.loc[top_f_desig_mask]

In [None]:
lemmaTokenizer = LemmaTokenizer()
list_sw=stopwords.words() + list(string.punctuation) + ['st.',"'s",'wine','vine','-']
# norm=None (not False) is the documented way to disable normalization
vectorizer = TfidfVectorizer(tokenizer=lemmaTokenizer,stop_words=list_sw, use_idf=False, norm=None, binary=True, ngram_range=(2,5),max_features=1000)

# the vocabulary is learnt on the rows with a known designation only, then applied to both subsets
vectorizer.fit(df_desc_dev['description'].fillna(''))
wpm = vectorizer.transform(df_desc_dev['description'].fillna(''))
wpm_eval = vectorizer.transform(df_desc_eval['description'].fillna(''))

# Fetch the vocabulary once; get_feature_names() was replaced by
# get_feature_names_out() in newer scikit-learn, so support both.
feature_names = list(vectorizer.get_feature_names_out() if hasattr(vectorizer, 'get_feature_names_out') else vectorizer.get_feature_names())
freq = sorted(zip(feature_names, wpm.sum(axis=0).tolist()[0]),key=lambda x: x[1], reverse=True)
words = [ word for word, _ in freq ]  #we take the top N word
top_words = set(words)  # set membership keeps the two scans below linear
mask = [ w in top_words for w in feature_names ]
words_ = [ w for w in feature_names if w in top_words ]

# the same column selection is applied to both matrices
column_mask = np.array(mask)
description_words_df_desc =wpm[:, column_mask].toarray()
description_words_df_desc_eval =wpm_eval[:, column_mask].toarray()

In [None]:
#the designation labels are our target variable, so they are encoded as integers
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
designation_labels = df_desc_dev['designation']
y_dev = le.fit_transform(designation_labels)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, vstack

# one-hot encode the categorical attributes; categories not seen while fitting are ignored
categorical_cols = ['province', 'variety', 'winery']
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df_desc_dev[categorical_cols])

desc_1h_dev = enc.transform(df_desc_dev[categorical_cols])
desc_1h_eval = enc.transform(df_desc_eval[categorical_cols])

# final feature matrices: description n-gram columns followed by the one-hot columns
X_dev = hstack([description_words_df_desc, desc_1h_dev])
X_eval = hstack([description_words_df_desc_eval, desc_1h_eval])


validation of the classifier:

In [None]:
# 80/20 shuffled hold-out split with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X_dev,
    y_dev,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# random_state is fixed (same seed as the hold-out split) so the forest, and
# therefore the reported accuracy, is reproducible from one run to the next.
clf=RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_valid)

# hold-out accuracy of the designation classifier
acc=accuracy_score(y_valid, y_pred)
print(acc)

In [None]:
# 5-fold cross-validation of the classifier (estimator's default scorer)
scores = cross_val_score(estimator=clf, X=X_dev, y=y_dev, cv=5)

In [None]:
# display the per-fold cross-validation scores computed in the previous cell
scores

#### try prediction:

In [None]:
# refit on all the rows with a known designation, then predict the missing ones
clf.fit(X_dev, y_dev)
desig_predicted = clf.predict(X_eval)

In [None]:
import collections, numpy
# map the predicted integer labels back to the designation strings
prediction=le.inverse_transform(desig_predicted)
# count how many times each designation was predicted and show the 20 most common
c=collections.Counter(prediction)
c.most_common(20)

In [None]:
 #we insert prediction in index desig predict              
df.loc[index_desig_predict,'designation']=prediction

# [7]

## Final encoding

In [11]:
# put the target aside, then remove the columns that are not fed to the final encoding
df_quality = df['quality']
columns_to_drop = ['country', 'winery', 'variety', 'description', 'designation', 'quality', 'region_2']
df.drop(columns=columns_to_drop, inplace=True)

In [12]:
# number of development rows: in df they come first, so this is the dev/eval boundary
tresh = len(df_dev)

In [13]:
# display the remaining (categorical) columns that are going to be one-hot encoded
df

Unnamed: 0,province,region_1,tf_winery,tf_variety
0,alsace,cremant d'alsace,lucien albrecht,pinot blanc
1,california,paso robles,castle rock,cabernet sauvignon
2,oregon,willamette valley,chateau bianca,gewurztraminer
3,alentejano,,herdade do esporao,touriga nacional
4,southern italy,pompeiano,sorrentino,coda di volpe
...,...,...,...,...
115186,california,napa valley,lail,bordeaux-style red blend
115187,california,dry creek valley,mounts,cabernet franc
115188,california,santa barbara county,tercero,g-s-m
115189,polkadraai hills,,stellenbosch hills,


In [14]:
# one hot encoding of the categorical attributes, stored as sparse columns
df_1h = pd.get_dummies(data=df, sparse=True)

In [15]:
# append the designation word columns (step [3]) to the right of the one-hot columns
# NOTE(review): running this cell twice appends the word columns twice
encoded_blocks = [df_1h, desig_words_df]
df_1h = hstack(encoded_blocks)

In [16]:
# switch to CSR format so that the matrix can be sliced by rows
df_1h = csr_matrix(df_1h)

In [17]:
# the first `tresh` rows are the development set, the remaining ones the evaluation set
X_dev, X_eval = df_1h[:tresh, :], df_1h[tresh:, :]
y_dev = df_quality[:tresh]


In [18]:
# (number of rows, number of encoded features) of the full dev+eval matrix
df_1h.shape

(115191, 17940)

### features reduction:

discarded approach; got only worse results

In [None]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=200, random_state=42)
# Learn the projection on the development rows, then apply it to the evaluation
# rows as well and stack the two parts in the original order. Assigning only
# svd.fit_transform(X_dev) to df_1h would drop the evaluation rows, so a later
# df_1h[tresh:] slice would be empty.
df_1h = np.vstack([svd.fit_transform(X_dev), svd.transform(X_eval)])


In [None]:
import matplotlib.pyplot as plt

# explained variance ratio of each SVD component, drawn as unconnected points
variance_ratio = svd.explained_variance_ratio_
plt.plot(variance_ratio, linestyle='', marker='o')

## models evaluation and search:

Various models will be tested in order to find the best performer on the scoreboard. Both hold-out validation and cross-validation were performed when possible.

In [19]:
#hold out: 80/20 shuffled split with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X_dev,
    y_dev,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

### 1) linear svr

grid search:

In [20]:
# grid of LinearSVR hyper-parameters, searched with 5-fold CV and scored by R^2
param_grid = {
    'epsilon': [3, 5],
    'C': [4, 5, 6, 7],
    'fit_intercept': [True, False],
    'max_iter': [1500, 2000],
}
reg = LinearSVR()
gridsearch = GridSearchCV(reg, param_grid, scoring='r2', cv=5)
gridsearch.fit(X_dev, y_dev)

print(gridsearch.best_params_)

{'C': 7, 'epsilon': 3, 'fit_intercept': True, 'max_iter': 1500}


cross validation:

In [21]:
# 20-fold cross-validated R^2 of the LinearSVR
# NOTE(review): C=6 and max_iter=5000 differ from the grid-search best parameters printed above - confirm this is intended
reg = LinearSVR(C=6, epsilon=3, max_iter=5000)
scores_lsvr = cross_val_score(reg, X_dev, y_dev, scoring='r2', cv=20)
print(scores_lsvr)

[0.74516161 0.71376035 0.71337893 0.74137182 0.72481427 0.72683652
 0.7177324  0.71077976 0.73073225 0.72308581 0.71428657 0.72991698
 0.72912269 0.72569897 0.71608954 0.72275414 0.70854414 0.7357361
 0.71484268 0.73223929]


In [22]:
# spread of the R^2 score across the folds
np.std(scores_lsvr)

0.009966621807733689

hold-out validation:

In [None]:
# train on the hold-out training part, score on the validation part
reg = LinearSVR(C=6, epsilon=3, max_iter=5000)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_valid)
r2 = r2_score(y_valid, y_pred)
print(r2)

#### 2) ridge regressor

grid search:

In [23]:
# grid of Ridge hyper-parameters, searched with 5-fold CV and scored by R^2
param_grid = {
    'alpha': [0.5, 1, 10],
    'max_iter': [1000, 2000, 5000],
    'solver': ['sparse_cg', 'sag'],
}
reg = Ridge()
gridsearch = GridSearchCV(reg, param_grid, scoring='r2', cv=5)
gridsearch.fit(X_dev, y_dev)

print(gridsearch.best_params_)

{'alpha': 1, 'max_iter': 1000, 'solver': 'sparse_cg'}


cross validation:

In [24]:
# 20-fold cross-validated R^2 of the Ridge regressor with the grid-search parameters
reg = Ridge(alpha=1, max_iter=1000, solver='sparse_cg')
scores_ridge = cross_val_score(reg, X_dev, y_dev, scoring='r2', cv=20)

In [25]:
# display the per-fold R^2 scores of the Ridge regressor
scores_ridge

array([0.75395561, 0.7198657 , 0.71942999, 0.74749069, 0.7336794 ,
       0.73563732, 0.72429293, 0.71913457, 0.74277626, 0.73056684,
       0.72159704, 0.73674058, 0.73962034, 0.7317529 , 0.73076569,
       0.73087937, 0.71889236, 0.74497876, 0.7237664 , 0.74127446])

In [None]:
# spread of the R^2 score across the folds
np.std(scores_ridge)

hold-out validation:

In [None]:
# train on the hold-out training part, score on the validation part
reg = Ridge(alpha=1, max_iter=1000, solver='sparse_cg')
reg.fit(X_train, y_train)

y_pred = reg.predict(X_valid)
r2 = r2_score(y_valid, y_pred)
print(r2)

#### 3) random forest regressor (feasible only when target encoding is used)

In [None]:
# random_state is fixed (same seed as the hold-out split) so the forest, and
# therefore the reported R^2, is reproducible from one run to the next.
reg = RandomForestRegressor(n_estimators=1000, max_depth=30, max_features='sqrt', random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_valid)
# hold-out R^2 of the random forest
r2 = r2_score(y_valid, y_pred)
print(r2)

#### 4) mlp regressor

several mlp config were tested, here we only report few of them

In [None]:
# small network with an adaptive learning rate and early stopping
reg = MLPRegressor(
    hidden_layer_sizes=(10, 10, 10, 64),
    learning_rate='adaptive',
    learning_rate_init=0.1,
    max_iter=10000,
    random_state=42,
    verbose=True,
    early_stopping=True,
    n_iter_no_change=20,
    tol=0.0001,
)
# trained on the whole development set; predictions are made for the evaluation set
reg.fit(X_dev, y_dev)
y_pred = reg.predict(X_eval)
                   


In [None]:
# five hidden layers, default optimizer settings, early stopping enabled
reg = MLPRegressor(
    hidden_layer_sizes=(64, 128, 128, 516, 516),
    random_state=42,
    verbose=True,
    early_stopping=True,
)
# trained on the whole development set; predictions are made for the evaluation set
reg.fit(X_dev, y_dev)
y_pred = reg.predict(X_eval)

In [None]:
# eleven hidden layers, default optimizer settings, early stopping enabled
reg = MLPRegressor(
    hidden_layer_sizes=(64, 128, 128, 128, 256, 256, 256, 516, 516, 516, 1024),
    random_state=42,
    verbose=True,
    early_stopping=True,
)
# trained on the whole development set; predictions are made for the evaluation set
reg.fit(X_dev, y_dev)
y_pred = reg.predict(X_eval)

## final models attempt (and print out):

In [None]:
# LinearSVR trained on the whole development set
reg = LinearSVR(C=6, epsilon=3, max_iter=1500)
reg.fit(X_dev, y_dev)
y_pred = reg.predict(X_eval)

# submission file: row position as "Id", prediction as "Predicted"
submission = pd.DataFrame(y_pred)
submission.to_csv("output_svr.csv", index_label="Id", header=["Predicted"])

In [None]:
# Ridge regressor trained on the whole development set
reg = Ridge(alpha=1, max_iter=1000, solver='sparse_cg')
reg.fit(X_dev, y_dev)
y_pred = reg.predict(X_eval)

# submission file: row position as "Id", prediction as "Predicted"
submission = pd.DataFrame(y_pred)
submission.to_csv("output_ridge.csv", index_label="Id", header=["Predicted"])

## [8] best performer:

In [None]:
# eleven hidden layers, default optimizer settings, early stopping enabled
reg = MLPRegressor(
    hidden_layer_sizes=(64, 128, 128, 128, 256, 256, 256, 516, 516, 516, 1024),
    random_state=42,
    verbose=True,
    early_stopping=True,
)
# trained on the whole development set; predictions are made for the evaluation set
reg.fit(X_dev, y_dev)
y_pred = reg.predict(X_eval)

# submission file: row position as "Id", prediction as "Predicted"
submission = pd.DataFrame(y_pred)
submission.to_csv("output_mlp.csv", index_label="Id", header=["Predicted"])