**Project - Winter session - Maria Margherita Lovera (s278425)**

Libraries and functions

In [47]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import random
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR

# fill designation NaNs with the most frequent designation for each province/country and variety of wine
def fill_NaN_des(df, designation):
  """Fill missing values of the `designation` column of `df` in place.

  Two passes are made over the frame:

  1. every missing value is replaced by the most frequent known value among
     rows sharing the same ('province', 'variety');
  2. values still missing are replaced by the most frequent known value among
     rows sharing the same ('country', 'variety'). Values written by the first
     pass take part in the counts of the second one.

  Ties are resolved in favour of the value that appears first in the frame.
  Rows for which no matching group exists are left untouched. A grouping value
  that is itself missing never matches anything.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns `designation`, 'province', 'country' and
      'variety'. Modified in place.
  designation : str
      Name of the column to fill.

  Returns
  -------
  None
  """

  def most_frequent_by(group_cols):
    """Map each tuple of `group_cols` values to its most frequent known target value."""
    counts = {}
    columns = [df[designation].tolist()] + [df[col].tolist() for col in group_cols]
    for value, *group in zip(*columns):
      if pd.isna(value) or any(pd.isna(part) for part in group):
        continue
      key = (tuple(group), value)
      counts[key] = counts.get(key, 0) + 1

    best = {}
    for (group, value), seen in counts.items():
      # strict '>' keeps the first value encountered when counts are tied
      if seen > best.get(group, (None, 0))[1]:
        best[group] = (value, seen)
    return {group: value for group, (value, _) in best.items()}

  def fill_missing_from(lookup, group_cols):
    """Write the looked-up value into every row whose target is still missing."""
    mask = df[designation].isna().to_numpy()
    columns = [df.index[mask].tolist()] + [df.loc[mask, col].tolist() for col in group_cols]
    for index, *group in zip(*columns):
      # groups are compared position by position, so a designation that happens
      # to be spelled like a province/variety can no longer produce a match
      value = lookup.get(tuple(group))
      if value is not None:
        df.at[index, designation] = value

  # Progress is reported once per stage (the per-row counter dominated runtime);
  # the printed lines match the final state of the former progress display.
  lookup = most_frequent_by(['province', 'variety'])
  print("Generating count dictionary for province and variety (100.0%)")
  fill_missing_from(lookup, ['province', 'variety'])
  print(f"Filling {designation} NaNs with respect to province and variety: (100.0%)")
  print("Done.")

  lookup = most_frequent_by(['country', 'variety'])
  print("Generating count dictionary for country and variety (100.0%)")
  fill_missing_from(lookup, ['country', 'variety'])
  print(f"Filling remaining {designation} NaNs with respect to country and variety: (100.0%)")
  print("Done.")


# fill region_1 NaNs with the most frequent with respect to province and variety of wine
def fill_NaN_reg1(df, region):
  """Fill missing values of the `region` column of `df` in place.

  Three passes are made over the frame:

  1. every missing value is replaced by the most frequent known value among
     rows sharing the same ('province', 'variety');
  2. values still missing are replaced by the most frequent known value among
     rows sharing the same 'province' (values written by pass 1 are counted);
  3. whatever is still missing is set to the row's own 'province'.

  Ties are resolved in favour of the value that appears first in the frame.
  A grouping value that is itself missing never matches anything, so a row
  whose 'province' is missing stays missing after pass 3 as well.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns `region`, 'province' and 'variety'.
      Modified in place.
  region : str
      Name of the column to fill. It is used consistently for reading,
      counting remaining NaNs and writing.

  Returns
  -------
  None
  """

  def most_frequent_by(group_cols):
    """Map each tuple of `group_cols` values to its most frequent known target value."""
    counts = {}
    columns = [df[region].tolist()] + [df[col].tolist() for col in group_cols]
    for value, *group in zip(*columns):
      if pd.isna(value) or any(pd.isna(part) for part in group):
        continue
      key = (tuple(group), value)
      counts[key] = counts.get(key, 0) + 1

    best = {}
    for (group, value), seen in counts.items():
      # strict '>' keeps the first value encountered when counts are tied
      if seen > best.get(group, (None, 0))[1]:
        best[group] = (value, seen)
    return {group: value for group, (value, _) in best.items()}

  def fill_missing_from(lookup, group_cols):
    """Write the looked-up value into every row whose target is still missing."""
    mask = df[region].isna().to_numpy()
    columns = [df.index[mask].tolist()] + [df.loc[mask, col].tolist() for col in group_cols]
    for index, *group in zip(*columns):
      # groups are compared position by position, so a region that happens to
      # be spelled like a province can no longer produce a false match
      value = lookup.get(tuple(group))
      if value is not None:
        df.at[index, region] = value

  # Progress is reported once per stage (the per-row counter dominated runtime);
  # the printed lines match the final state of the former progress display.
  lookup = most_frequent_by(['province', 'variety'])
  print("Generating count dictionary for province and variety (100.0%)")
  fill_missing_from(lookup, ['province', 'variety'])
  print(f"Filling {region} NaNs with respect to provinces and varieties: (100.0%)")
  print('We still have ' + str(df[region].isna().sum()) + ' NaNs to fill.')

  lookup = most_frequent_by(['province'])
  print("Generating count dictionary for provinces (100.0%)")
  fill_missing_from(lookup, ['province'])
  print(f"Filling {region} NaNs with respect to provinces: (100.0%)")
  print('We still have ' + str(df[region].isna().sum()) + ' NaNs to fill.')

  # last resort: reuse the province itself as the region
  mask = df[region].isna().to_numpy()
  if mask.any():
    df.loc[mask, region] = df.loc[mask, 'province'].to_numpy()
  print(f"Filling {region} NaNs with correspondent province when no informations are given: (100.0%)")
  print("Done.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing

In [48]:
# load the development set (tab-separated file)
data = pd.read_csv("dev.tsv", sep='\t')

# keep only the rows whose country is known, then renumber them from zero
has_country = data["country"].notna()
data = data.loc[has_country].reset_index(drop=True)

# impute missing designations from province/country and variety
print('Designation NaN filling')
fill_NaN_des(data, 'designation')

# rows whose designation could not be imputed are discarded
has_designation = data['designation'].notna()
data = data.loc[has_designation].reset_index(drop=True)

# impute missing region_1 values from province and variety
print('Regions of group one NaN filling')
fill_NaN_reg1(data, 'region_1')

Generating count dictionary for province and variety (100.0%)
Filling designation NaNs with respect to province and variety: (100.0%)
Done.
Generating count dictionary for country and variety (100.0%)
Filling remaining designation NaNs with respect to country and variety: (100.0%)
Done.
Generating count dictionary for province and variety (100.0%)
Filling region_1 NaNs with respect to provinces and varieties: (100.0%)
We still have 19028 NaNs to fill.
Generating count dictionary for provinces (100.0%)
Filling region_1 NaNs with respect to provinces: (100.0%)
We still have 18575 NaNs to fill.
Filling region_1 NaNs with correspondent province when no informations are given: (100.0%)
Done.


Model implementation

In [49]:
# `random.seed` only seeds Python's own generator; scikit-learn estimators left
# without a `random_state` draw from NumPy's global generator, so seed it too.
random.seed(1000)
np.random.seed(1000)

print('Preparing data:', end='')
df = data[['description', 'winery', 'designation', 'region_1', 'variety']]
target = data['quality']

# train-test split for analysis (80% train / 20% test, fixed seed)
X_train,X_test,y_train,y_test = train_test_split(df, target, test_size=0.20, random_state = 4)

# Tf-idf on descriptions
# scikit-learn documents `stop_words` as 'english', a list or None; releases that
# validate parameters reject a set, hence the (deduplicated, sorted) list.
stop_words = sorted(set(stopwords.words('english')))
vectorizer = TfidfVectorizer(ngram_range=(1,2), analyzer = 'word', lowercase=True, stop_words = stop_words)
print()
print("Extracting tf-idf features of training set...", end="")
# the vocabulary is learned on the training split only and reused on the test split
X_train_descriptions = vectorizer.fit_transform(X_train['description'])
print(f"Done. Number of extracted features: {X_train_descriptions.shape[1]}")
print("Extracting tf-idf features of test set...", end="")
X_test_descriptions = vectorizer.transform(X_test['description'])
print('Done.')

# wineries: binary presence of word uni-grams and bi-grams (case preserved)
print("Binarization of remaining features...", end="")
cvectw = CountVectorizer(lowercase = False, ngram_range = (1,2), binary = True)
X_train_wineries = cvectw.fit_transform(X_train['winery'])
X_test_wineries = cvectw.transform(X_test['winery'])

# designations
cvectd = CountVectorizer(lowercase = False, ngram_range = (1,2), binary = True)
X_train_designations = cvectd.fit_transform(X_train['designation'])
X_test_designations = cvectd.transform(X_test['designation'])

# region_1
cvectr = CountVectorizer(lowercase = False, ngram_range = (1,2), binary = True)
X_train_regions = cvectr.fit_transform(X_train['region_1'])
X_test_regions = cvectr.transform(X_test['region_1'])

# add varieties
cvectv = CountVectorizer(lowercase = False, ngram_range = (1,2), binary = True)
X_train_varieties = cvectv.fit_transform(X_train['variety'])
X_test_varieties = cvectv.transform(X_test['variety'])
print('Done.')

# make data for regression: concatenate all sparse feature blocks column-wise
print('Construction of train set and test set for regression...', end='')
X_train_reg = scipy.sparse.hstack((X_train_descriptions, X_train_wineries, X_train_designations, X_train_regions, X_train_varieties))
X_test_reg = scipy.sparse.hstack((X_test_descriptions, X_test_wineries, X_test_designations, X_test_regions, X_test_varieties))

print('Done.')



Preparing data:
Extracting tf-idf features of training set...Done. Number of extracted features: 612425
Extracting tf-idf features of test set...Done.
Binarization of remaining features...Done.
Construction of train set and test set for regression...Done.


In [50]:
# linear svr regression on the hold-out split
print('Starting LinearSVR regression...', end='')
# `random_state` is fixed so the fitted model, and therefore the reported R2,
# is the same on every run of the notebook
SVRreg = LinearSVR(C = 10, epsilon = 0.8, max_iter = 3000, random_state = 1000)
SVRreg.fit(X_train_reg, y_train)
SVRy_pred = SVRreg.predict(X_test_reg)
print('Done. R2 score: ' + str(r2_score(y_test, SVRy_pred)))

Starting LinearSVR regression...Done. R2 score: 0.8612190309457975


Model implementation on evaluation set 

In [51]:
# import dataframe
data_eval = pd.read_csv("eval.tsv", sep = '\t')

# fill NaNs of designation
print('Designation NaN filling')
fill_NaN_des(data_eval, 'designation')

# convert remaining NaNs of designation into strings
# here it is not possible to drop rows since we are evaluating new data
data_eval['designation'] = data_eval['designation'].fillna('NaN')

# fill NaNs of region_1
print('Regions of group one NaN filling')
fill_NaN_reg1(data_eval, 'region_1')


################# data preprocessing for model implementation #####################
random.seed(1100)
print('Preparing data...', end='')
dataframe = data[['description', 'winery', 'designation', 'region_1', 'variety']]
dataframe_eval = data_eval[['description', 'winery', 'designation', 'region_1', 'variety']]
target = data['quality']

# Tf-idf on descriptions
# `stop_words` comes from the model-implementation cell; it is passed as a list
# because scikit-learn documents this parameter as 'english', a list or None.
# NOTE: no stemmer is configured on this vectorizer, so the features counted
# below are word uni-grams and bi-grams, despite the wording of the message.
vectorizerdef = TfidfVectorizer(ngram_range=(1,2), analyzer = 'word', lowercase=True, stop_words = list(stop_words))
print("Extracting tf-idf features of dev set...", end="")
X_descriptions = vectorizerdef.fit_transform(dataframe['description'])
print(f"Done. Number of extracted features (unique stemmed words): {X_descriptions.shape[1]}")
print("Extracting tf-idf features of eval set...", end="")
X_descriptions_eval = vectorizerdef.transform(dataframe_eval['description'])
print('Done.')

# wineries: binary presence of word uni-grams and bi-grams (case preserved)
print("Binarization of remaining features...", end="")
cvectwdef = CountVectorizer(lowercase = False, ngram_range = (1,2), binary = True)
X_wineries = cvectwdef.fit_transform(dataframe['winery'])
X_wineries_eval = cvectwdef.transform(dataframe_eval['winery'])

# designations
cvectddef = CountVectorizer(lowercase = False, ngram_range = (1,2), binary = True)
X_designations = cvectddef.fit_transform(dataframe['designation'])
X_designations_eval = cvectddef.transform(dataframe_eval['designation'])

# region_1
cvectrdef = CountVectorizer(lowercase = False, ngram_range = (1,2), binary = True)
X_regions = cvectrdef.fit_transform(dataframe['region_1'])
X_regions_eval = cvectrdef.transform(dataframe_eval['region_1'])

# varieties
cvectvdef = CountVectorizer(lowercase = False, ngram_range = (1,2), binary = True)
X_varieties = cvectvdef.fit_transform(dataframe['variety'])
X_varieties_eval = cvectvdef.transform(dataframe_eval['variety'])
print('Done.')


# make data for regression: concatenate all sparse feature blocks column-wise
print('Construction of dev set and eval set for regression...', end='')
X_def = scipy.sparse.hstack((X_descriptions, X_wineries, X_designations, X_regions, X_varieties))
X_def_eval = scipy.sparse.hstack((X_descriptions_eval, X_wineries_eval, X_designations_eval, X_regions_eval, X_varieties_eval))
print('Done.')

# linear svr regression (fixed `random_state` so the submission is reproducible)
print("Starting regression...", end="")
linreg_def = LinearSVR(C=10, epsilon = 0.8, max_iter = 3000, random_state = 1100).fit(X_def, target)
y_pred_def = linreg_def.predict(X_def_eval)
print('Done.')

# sample submission
# the Id column is built inline so the builtin `id` is not shadowed in the kernel
sub = {'Id': np.arange(len(y_pred_def)), 'Predicted': y_pred_def}
submission = pd.DataFrame(sub)
# submission.to_csv('sample_submission.csv', index = False) 

Designation NaN filling
Generating count dictionary for province and variety (100.0%)
Filling designation NaNs with respect to province and variety: (100.0%)
Done.
Generating count dictionary for country and variety (100.0%)
Filling remaining designation NaNs with respect to country and variety: (100.0%)
Done.
Regions of group one NaN filling
Generating count dictionary for province and variety (100.0%)
Filling region_1 NaNs with respect to provinces and varieties: (100.0%)
We still have 4871 NaNs to fill.
Generating count dictionary for provinces (100.0%)
Filling region_1 NaNs with respect to provinces: (100.0%)
We still have 4862 NaNs to fill.
Filling region_1 NaNs with correspondent province when no informations are given: (100.0%)
Done.
Preparing data...Extracting tf-idf features of dev set...Done. Number of extracted features (unique stemmed words): 680482
Extracting tf-idf features of eval set...Binarization of remaining features...Done.
Construction of dev set and eval set for r