In [58]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import ast

In [59]:
train_data = pd.read_csv('/content/train.csv', on_bad_lines='skip')

In [60]:
print(train_data.info())
print(train_data.describe())
print(train_data.head())
print(train_data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37500 entries, 0 to 37499
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              37500 non-null  int64  
 1   beer/ABV           37500 non-null  float64
 2   beer/beerId        37500 non-null  int64  
 3   beer/brewerId      37500 non-null  int64  
 4   beer/name          37500 non-null  object 
 5   beer/style         37500 non-null  object 
 6   review/appearance  37500 non-null  float64
 7   review/aroma       37500 non-null  float64
 8   review/overall     37500 non-null  float64
 9   review/palate      37500 non-null  float64
 10  review/taste       37500 non-null  float64
 11  review/text        37490 non-null  object 
 12  review/timeStruct  37500 non-null  object 
 13  review/timeUnix    37500 non-null  int64  
 14  user/ageInSeconds  7856 non-null   float64
 15  user/birthdayRaw   7856 non-null   object 
 16  user/birthdayUnix  785

In [61]:
train_data.drop(['user/profileName', 'beer/name', 'user/gender', 'review/timeStruct',
                 'user/birthdayRaw', 'beer/brewerId', 'user/ageInSeconds', 'beer/beerId',
                 'review/timeUnix', 'user/birthdayUnix'], axis=1, inplace=True)
train_data['beer/ABV'].fillna(train_data['beer/ABV'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['beer/ABV'].fillna(train_data['beer/ABV'].median(), inplace=True)


In [62]:
if 'review/timeStruct' in train_data.columns:
    train_data.drop(['review/timeStruct'], axis=1, inplace=True)

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
train_data['review/text'] = train_data['review/text'].fillna('')
tfidf = TfidfVectorizer(max_features=500)
text_features = tfidf.fit_transform(train_data['review/text']).toarray()


In [66]:
train_data['beer/style'] = pd.Categorical(train_data['beer/style']).codes
train_data = pd.concat([train_data, pd.DataFrame(text_features)], axis=1)
train_data.drop(['review/text'], axis=1, inplace=True)

In [67]:
numerical_columns = ['beer/ABV', 'review/appearance', 'review/aroma', 'review/palate', 'review/taste']
scaler = StandardScaler()
train_data[numerical_columns] = scaler.fit_transform(train_data[numerical_columns])

In [68]:
X = train_data.drop('review/overall', axis=1)
y = train_data['review/overall']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [70]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [71]:
rf_model.fit(X_train, y_train)

In [72]:
y_pred = rf_model.predict(X_test)

In [73]:
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
print(f'R^2 Score: {r2}')

MAE: 0.30104600000000004
RMSE: 0.39430595988394596
R^2 Score: 0.6839365055877111




LINEAR REGRESSION

In [75]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()

In [76]:
lr_model.fit(X_train, y_train)

In [79]:
y_pred_lr = lr_model.predict(X_test)

In [80]:
mae = mean_absolute_error(y_test, y_pred_lr)
rmse = mean_squared_error(y_test, y_pred_lr, squared=False)
r2 = r2_score(y_test, y_pred_lr)

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
print(f'R^2 Score: {r2}')

MAE: 0.2981224858576791
RMSE: 0.3907818753910019
R^2 Score: 0.689560853986467




GRADIENT BOOSTING

In [81]:
from sklearn.ensemble import GradientBoostingRegressor
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

In [82]:
gb_model.fit(X_train, y_train)

In [83]:
y_pred_gb = gb_model.predict(X_test)

In [84]:
mae = mean_absolute_error(y_test, y_pred_gb)
rmse = mean_squared_error(y_test, y_pred_gb, squared=False)
r2 = r2_score(y_test, y_pred_gb)

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
print(f'R^2 Score: {r2}')

MAE: 0.29601127016115947
RMSE: 0.3885862733299516
R^2 Score: 0.6930394495388327


