In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC

warnings.filterwarnings('ignore')

In [None]:
""""
# coffee data
url="https://github.com/jldbc/coffee-quality-database/raw/master/data/robusta_data_cleaned.csv"
coffee_features=pd.read_csv(url)

# coffe score

url="https://raw.githubusercontent.com/jldbc/coffee-quality-database/master/data/robusta_ratings_raw.csv"
coffee_quality=pd.read_csv(url)
coffee_quality.head()

Y = coffee_quality["quality_score"]
coffee_features.info()
#for this exercise we will only deal with numeric variables

X = coffee_features.select_dtypes(['number'])
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

#dropping Quakers column and unnamed
#changing one of the altitude to log and droping the original
X_train["altitude_mean_log"] = np.log(X_train["altitude_mean_meters"])
X_train.drop(['altitude_mean_meters'], axis=1, inplace=True)
X_train.drop(['Quakers'], axis=1, inplace=True)
X_train.drop(['Unnamed: 0'], axis=1, inplace=True)

X_train.info()

altitude_low_meters_mean = X_train["altitude_low_meters"].mean()
altitude_high_meters_mean = X_train["altitude_high_meters"].mean()
altitude_mean_log_mean = X_train["altitude_mean_log"].mean()

# fillna with mean.. 
X_train["altitude_low_meters"] = X_train["altitude_low_meters"].fillna(altitude_low_meters_mean)
X_train["altitude_high_meters"] = X_train["altitude_high_meters"].fillna(altitude_high_meters_mean)
X_train["altitude_mean_log"] = X_train["altitude_mean_log"].fillna(altitude_mean_log_mean)

print(f"altitude low meters mean is {altitude_low_meters_mean}")
print(f"altitude_high_meters_mean is {altitude_high_meters_mean}")
print(f"altitude_mean_log_mean is {altitude_mean_log_mean}")

## in order to exemplify how the predict will work.. we will save the y_train
X_test.to_csv("data/X_test.csv")
y_test.to_csv("data/y_test.csv")

#training the model
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)

from sklearn.metrics import mean_squared_error
y_train_pred = reg.predict(X_train)
mse = mean_squared_error(y_train, y_train_pred)
print(mse)

#dropping Quakers column and unnamed
#changing one of the altitude to log and droping the original
X_test["altitude_mean_log"] = np.log(X_test["altitude_mean_meters"])
X_test.drop(['altitude_mean_meters'], axis=1, inplace=True)
X_test.drop(['Quakers'], axis=1, inplace=True)
X_test.drop(['Unnamed: 0'], axis=1, inplace=True)
# fillna with mean.. 
X_test["altitude_low_meters"] = X_test["altitude_low_meters"].fillna(altitude_low_meters_mean)
X_test["altitude_high_meters"] = X_test["altitude_high_meters"].fillna(altitude_high_meters_mean)
X_test["altitude_mean_log"] = X_test["altitude_mean_log"].fillna(altitude_mean_log_mean)

y_test_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
print(mse)

"""

## Data Exploration

In [None]:
# Remove pandas' cap on displayed rows so long outputs in later cells are
# rendered in full.
pd.options.display.max_rows = None

# Load the training set from the local data folder and preview the first rows.
df = pd.read_csv('./data/Train.csv')
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count, to spot missing data and non-numeric columns.
df.info()

In [None]:
# Summary statistics rounded to two decimals, transposed so that each
# described column becomes one row.
df.describe().round(decimals=2).transpose()

In [None]:
# Number of distinct locations (Place_IDs) in the data; there are 340.
df['Place_ID'].nunique()

In [None]:
# Number of distinct dates measured for each Place_ID, most-covered places first.
(
    df.groupby('Place_ID')['Date']
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Count the entries that are exactly 0 in each column, working on a copy of the
# frame. NOTE(review): the table is named `missing`, which suggests zeros are
# being treated as candidate missing values — confirm.
df_pp = df.copy()
zero_counts = (df_pp == 0).sum()
missing = zero_counts.to_frame("Zero_Amount")

# Share of rows that are zero, as a percentage rounded to two decimals.
missing['Percentage'] = (missing['Zero_Amount'] / len(df) * 100).round(2)

# Show only the columns that contain at least one zero, worst offenders first.
has_zeros = missing['Zero_Amount'] != 0
missing.loc[has_zeros].sort_values(by='Percentage', ascending=False)


In [None]:
# Full list of column labels, for reference when choosing features below.
df.columns

# Data cleaning and feature engineering

In [None]:
# Parse the 'YYYY-MM-DD' date strings into datetime values.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# For every row, record how many rows share its Place_ID.
place_counts = df['Place_ID'].value_counts()
df['PlaceID_freq'] = df['Place_ID'].map(place_counts)

# Non-feature columns: these help to identify the data (or describe the
# target); they are not measurements.
no_features = [
    'Place_ID X Date', 'Date', 'Place_ID',
    'target', 'target_min', 'target_max', 'target_variance', 'target_count',
    'PlaceID_freq',
]

# Every remaining column is treated as a feature, in its original order.
excluded = set(no_features)
features = [col for col in df.columns if col not in excluded]

In [None]:
# Preview the identifier columns next to the engineered frequency column.
# A bare `df` would render every row here, because display.max_rows was set to
# None when the data was loaded; a fixed-seed sample keeps the output small
# and reproducible.
df[['Place_ID', 'Date', 'PlaceID_freq']].sample(5, random_state=42)

In [None]:
# First five rows of the frame after the date parsing and frequency column were added.
df.head()

## Splitting data for testing 

In [None]:
# Build the model inputs and the label from the cleaned frame; X and y were
# not defined anywhere before this cell.
# X: every column not listed in `no_features`.
# y: assumes the label to predict is the `target` column — TODO confirm.
X = df[features]
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Two-stage model: an L1-penalised linear SVM selects features, then a random
# forest classifies on the features that were kept.
# - dual=False is required for penalty="l1" with the default squared-hinge loss.
# - random_state is fixed on both estimators so reruns give the same model.
# NOTE(review): RandomForestClassifier expects discrete class labels — confirm
# that `y` is categorical; a continuous target needs a regressor instead.
clf = Pipeline([
    ('feature_selection', SelectFromModel(
        LinearSVC(penalty="l1", dual=False, random_state=42)
    )),
    ('classification', RandomForestClassifier(random_state=42)),
])

# Fit on the training split only, so X_test / y_test stay unseen for evaluation.
clf.fit(X_train, y_train)

## Training the model