In [178]:
import pandas as pd
import numpy as np

# Read the development and evaluation tables (tab-separated); exact duplicate
# rows are removed from the development table only.
df_dev = pd.read_csv('./dev.tsv', sep='\t').drop_duplicates()
df_eval = pd.read_csv('./eval.tsv', sep='\t')
len(df_dev), len(df_eval)

(85028, 30186)

In [179]:
# Drop outliers: stack both tables under a fresh 0..n-1 index, then remove the
# rows whose quality is exactly 0 (rows with missing quality are kept).
df = pd.concat([df_dev, df_eval], sort=False, ignore_index=True)
zero_quality = df['quality'].eq(0)
df.drop(index=df.index[zero_quality], inplace=True)

In [180]:
# Where region_1 is missing, fall back to the row's province.
df['region_1'] = df['region_1'].fillna(df['province'])

# Rows without a province cannot be filled and are removed.
df.dropna(subset=['province'], inplace=True)

# True for rows that carry a quality value.
train_valid_mask = df['quality'].notna()
df.shape

(115201, 9)

In [181]:
# REGION_1 CLEANING
# Within each province, every region_1 value that occurs fewer than `limit`
# times is merged into a single "<province> other" bucket.
limit = 50
df_1 = df  # alias, not a copy: the assignments below modify df itself
for prov in df_1['province'].unique():
  in_province = df_1['province'] == prov
  region_counts = df_1.loc[in_province, 'region_1'].value_counts()
  rare_regions = region_counts.index[region_counts < limit]
  # The rename is restricted to rows of this province. Matching on the region
  # name alone would also relabel rows of other provinces that happen to share
  # the name, tagging them with the wrong province.
  df_1.loc[in_province & df_1['region_1'].isin(rare_regions), 'region_1'] = f'{prov} other'

In [182]:
# REGION_1 CLEANING 2
# Second pass at country level: region_1 values that occur fewer than `limit`
# times within a country are merged into "<country> other".
limit = 50
df_1 = df  # alias, not a copy: the assignments below modify df itself
for cou in df_1['country'].unique():
  in_country = df_1['country'] == cou
  region_counts = df_1.loc[in_country, 'region_1'].value_counts()
  rare_regions = region_counts.index[region_counts < limit]
  # Only rows of this country are renamed; matching on the region name alone
  # would also relabel same-named regions that belong to other countries.
  df_1.loc[in_country & df_1['region_1'].isin(rare_regions), 'region_1'] = f'{cou} other'

In [183]:
# VARIETY CLEANING
# Varieties that occur fewer than `limit` times are merged into 'Variety other'.
# One value_counts pass replaces a full-frame scan per distinct variety.
limit = 20
variety_counts = df['variety'].value_counts()
rare_varieties = variety_counts.index[variety_counts < limit]
# Missing varieties are not counted by value_counts and are left untouched.
df.loc[df['variety'].isin(rare_varieties), 'variety'] = 'Variety other'


Preprocessing on winery

In [184]:
# Number of wines per winery, attached to every row.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
winery_count = df['winery'].value_counts()
# map() looks each row's winery up in the counts and keeps the row index.
df['winery_count'] = df['winery'].map(winery_count)

In [185]:
# Number of distinct varieties produced by each winery, repeated on every row
# of that winery.
varieties_per_winery = df.groupby('winery')['variety'].nunique()
df['var_x_win'] = varieties_per_winery.loc[df['winery']].values

In [186]:
# Number of wines of the row's variety produced by the row's winery, and the
# share of the winery's wines that this represents.
# A single groupby replaces the nested loops that re-filtered the whole frame
# for every winery and every (winery, variety) pair. Keys are
# (winery, variety) tuples, so two different pairs can never produce the same
# key the way a '<winery>_<variety>' string could.
wine_var_winery = df.groupby(['winery', 'variety']).size().to_dict()

# Pairs absent from the groupby result (e.g. a missing variety) count as 0.
win = [wine_var_winery.get(pair, 0) for pair in zip(df['winery'], df['variety'])]
df['wine_x_var_x_win'] = win
df['ratio_variety_x_winery'] = df['wine_x_var_x_win'] / df['winery_count']

Preprocessing on designation

In [187]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Corpora/models used by stopwords, WordNetLemmatizer and word_tokenize.
# 'punkt_tab' is what word_tokenize loads in recent NLTK releases and
# 'omw-1.4' is required by WordNet in some releases; fetching them alongside
# the original three keeps the notebook runnable on a fresh environment.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
  nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [188]:
class LemmaTokenizer(object):
  """Callable that splits a document into tokens and lemmatizes each one.

  Instances are meant to be passed as the ``tokenizer`` of a vectorizer.
  """

  def __init__(self):
    self.lemmatizer = WordNetLemmatizer()

  def __call__(self, document):
    """Return the list of lemmas for ``document``.

    Each token produced by ``word_tokenize`` is stripped of surrounding
    whitespace and then lemmatized; order and length follow the token list.
    """
    lemmatize = self.lemmatizer.lemmatize
    return [lemmatize(token.strip()) for token in word_tokenize(document)]

stop_words = stopwords.words('english')

# Only rows with a known quality are used to choose the vocabulary.
train_valid_mask = df['quality'].notna()
designations = df['designation'].fillna('')

lemmaTokenizer = LemmaTokenizer()
# binary=True with use_idf=False and no normalisation turns every entry into a
# 0/1 "word present" flag. norm=None is the documented way to switch
# normalisation off; norm=False is rejected by scikit-learn's parameter
# validation in recent releases.
vectorizer = TfidfVectorizer(tokenizer=lemmaTokenizer, stop_words=stop_words, binary=True, use_idf=False, norm=None)
tfidf_train_desi = vectorizer.fit_transform(designations[train_valid_mask])

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back to the old accessor otherwise.
_get_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_words = list(_get_feature_names())

# Keep the N words that appear in the largest number of training designations.
N = 120
word_totals = tfidf_train_desi.sum(axis=0).tolist()[0]
freq = sorted(zip(feature_words, word_totals), key=lambda x: x[1], reverse=True)[:N]

tfidf_df = vectorizer.transform(designations)

# Column mask over the vocabulary: True only for the N most frequent words.
words = [word for word, _ in freq]
top_words = set(words)  # set membership instead of scanning the list per word
mask = [w in top_words for w in feature_words]
words_ = [w for w in feature_words if w in top_words]
words_df1 = pd.DataFrame(data=tfidf_df[:, np.array(mask)].toarray(), columns=[f"word_{word}" for word in words_], index=df.index)

# One indicator column per selected word, aligned on df's index.
df_word = df.join(words_df1)


  'stop_words.' % sorted(inconsistent))


One-hot encoding

In [189]:
from scipy import sparse
df_1 = df_word

# The least frequent provinces get no indicator column of their own.
N_RARE_PROVINCES = 355
province_counts = df_1['province'].value_counts()
# Least frequent first, same order as slicing the counts with [-1:-356:-1].
sample_province = list(province_counts.index[::-1][:N_RARE_PROVINCES])

# Blank the rare provinces before encoding instead of dropping columns by name
# afterwards. With prefix='' the dummies of all four columns share one
# namespace, so a name-based drop would also remove region/variety indicators
# that happen to be spelled like a rare province.
is_kept_province = ~df_1['province'].isin(sample_province)
df_1 = df_1.assign(province=df_1['province'].where(is_kept_province))

# dtype is pinned to uint8 so the indicators stay numeric on pandas versions
# whose get_dummies default is bool.
df_1 = pd.get_dummies(df_1, columns=['region_1', 'region_2', 'variety', 'province'], prefix='', prefix_sep='', dtype=np.uint8)

In [190]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
# Columns that are not used as model inputs.
non_feature_cols = ['designation', 'country', 'winery', 'description']
df_dropped = df_1.drop(columns=non_feature_cols)

# Rows with a quality value are used for training/validation; the rest are
# the rows to predict.
train_valid_mask = df_dropped['quality'].notna()

features = df_dropped.drop(columns=['quality'])
# extract the feature names (for later use)
feature_names = features.columns
X = features.values
y = df_dropped['quality'].values

has_quality = train_valid_mask.values
X_train_valid, y_train_valid = X[has_quality], y[has_quality]
X_test, y_test = X[~has_quality], y[~has_quality]

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, y_train_valid, shuffle=True, random_state=42
)

In [192]:
# Hold-out check: fit on the training split, report R^2 on the validation split.
reg = RandomForestRegressor(n_estimators=150, random_state=42).fit(X_train, y_train)
valid_pred = reg.predict(X_valid)
r2_score(y_valid, valid_pred)

In [193]:
# Prediction on test set
# Refit on every row with a quality value (training + validation splits).
reg = RandomForestRegressor(n_estimators=150, random_state=42)
reg.fit(X_train_valid, y_train_valid)
y_pred = reg.predict(X_test)

# The combined frame was built with ignore_index=True and rows were only ever
# dropped, never re-indexed, so a surviving eval row's label minus len(df_dev)
# is its position in df_eval. Using df_eval.index directly would misalign (or
# fail on a length mismatch) as soon as one eval row was removed by cleaning.
# NOTE(review): assumes every row with missing quality comes from df_eval - confirm.
eval_positions = np.asarray(df_dropped.index[~np.asarray(train_valid_mask)]) - len(df_dev)
assert (eval_positions >= 0).all(), "found a row without quality that does not belong to df_eval"

predictions = pd.Series(y_pred, index=df_eval.index[eval_positions])
# Eval rows removed during cleaning have no model prediction; they receive the
# mean training quality so that every Id is present in the output file.
predictions = predictions.reindex(df_eval.index).fillna(y_train_valid.mean())
pd.DataFrame(predictions).to_csv("output.csv", index_label="Id", header=["Predicted"])