# Feature Engineering

In [1]:
import pandas as pd
import numpy as np

In [28]:
# Load the preprocessed dataset into a DataFrame. The path is relative,
# so it is resolved against the kernel's current working directory.
file_location = "finaldata.csv"
data = pd.read_csv(filepath_or_buffer=file_location)
data

Unnamed: 0.1,Unnamed: 0,index,country,points,price,variety
0,0,2,US,96,90.0,Sauvignon Blanc
1,1,3,US,96,65.0,Pinot Noir
2,2,8,US,95,65.0,Pinot Noir
3,3,9,US,95,60.0,Pinot Noir
4,4,11,US,95,48.0,Pinot Noir
...,...,...,...,...,...,...
93467,93467,150911,France,87,37.0,Pinot Noir
93468,93468,150912,France,87,65.0,Pinot Noir
93469,93469,150915,US,93,30.0,White Blend
93470,93470,150925,Italy,91,20.0,White Blend


In [29]:
# Keep only the columns used from here on; the remaining CSV columns
# (the 'Unnamed: ...' columns and 'index') are left behind.
cols = [
    'country',
    'points',
    'price',
    'variety',
]
modified_data = data.loc[:, cols]
modified_data

Unnamed: 0,country,points,price,variety
0,US,96,90.0,Sauvignon Blanc
1,US,96,65.0,Pinot Noir
2,US,95,65.0,Pinot Noir
3,US,95,60.0,Pinot Noir
4,US,95,48.0,Pinot Noir
...,...,...,...,...
93467,France,87,37.0,Pinot Noir
93468,France,87,65.0,Pinot Noir
93469,US,93,30.0,White Blend
93470,Italy,91,20.0,White Blend


In [30]:
# Put the prediction target (label) first in the column order.
label = 'points'
colIdx = modified_data.columns.get_loc(label)

# Rebuild the order from modified_data's own columns rather than from the
# current value of `cols`. `colIdx` is a position in modified_data.columns, so
# slicing `cols` with it is only correct the first time this cell runs: on a
# second run the same shuffle would be applied to the already-reordered list
# and the label would be moved back out of position 0. Deriving the order from
# modified_data.columns makes the cell idempotent. When the label is already
# first (colIdx == 0) the expression reproduces the existing order unchanged.
source_cols = list(modified_data.columns)
cols = [source_cols[colIdx]] + source_cols[:colIdx] + source_cols[colIdx + 1:]
cols

['points', 'country', 'price', 'variety']

In [31]:
# Apply the column order held in `cols` to the selected data.
final_data = modified_data.loc[:, cols]
final_data

Unnamed: 0,points,country,price,variety
0,96,US,90.0,Sauvignon Blanc
1,96,US,65.0,Pinot Noir
2,95,US,65.0,Pinot Noir
3,95,US,60.0,Pinot Noir
4,95,US,48.0,Pinot Noir
...,...,...,...,...
93467,87,France,37.0,Pinot Noir
93468,87,France,65.0,Pinot Noir
93469,93,US,30.0,White Blend
93470,91,Italy,20.0,White Blend


In [32]:
# One hot encode and fill missing values
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [33]:
# Separate the feature columns from the label so that the label is not
# passed through imputation / one-hot encoding.
data_without_label = final_data.drop(columns=[label])

In [34]:
# Numeric columns: replace missing values with the column median.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
    ]
)

In [35]:
# Categorical columns: fill missing entries with the constant 'missing',
# then one-hot encode.
# handle_unknown='ignore': a category that was not seen during fit is encoded
# as all zeros in that feature's one-hot columns instead of raising an error.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [36]:
# Feature columns whose dtype is int64 or float64 are treated as numeric.
numeric_features = (
    data_without_label
    .select_dtypes(include=['int64', 'float64'])
    .columns
)

In [37]:
# Every feature column that is NOT int64/float64 is treated as categorical.
# NOTE(review): numeric columns of another width (e.g. int32, float32) would
# also land here and be one-hot encoded - confirm that cannot happen upstream.
categorical_features = (
    data_without_label
    .select_dtypes(exclude=['int64', 'float64'])
    .columns
)

In [38]:
# Route each group of columns to its own transformer: the numeric columns go
# through the median imputer, the categorical ones through imputation plus
# one-hot encoding.
preprocessor_cols = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [39]:
# Wrap the column transformer in a Pipeline so that further steps can be
# added to it later, then fit it on the feature columns.
preprocessor = Pipeline(
    steps=[
        ('preprocessor', preprocessor_cols),
    ]
)
preprocessor.fit(data_without_label)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [40]:
# Run the fitted preprocessing over the feature columns.
final_data_without_label = preprocessor.transform(X=data_without_label)
final_data_without_label

<93472x31 sparse matrix of type '<class 'numpy.float64'>'
	with 280416 stored elements in Compressed Sparse Row format>

In [41]:
# Densify the transformed features when they are not already a NumPy array
# (the transform step can hand back a SciPy sparse matrix), because the
# concatenation with the label column needs a dense array.
# isinstance() is used instead of an exact type() comparison so that ndarray
# subclasses, which have no .toarray() method, are not sent down the sparse
# branch.
if not isinstance(final_data_without_label, np.ndarray):
    final_data_without_label = final_data_without_label.toarray()

In [42]:
# Assemble the final matrix: the label goes in column 0 and the encoded
# feature columns follow it. column_stack turns the 1-D label values into a
# single column before joining.
final_data_array = np.column_stack(
    (np.asarray(final_data[label]), final_data_without_label)
)
final_data_array

array([[96., 90.,  0., ...,  0.,  0.,  0.],
       [96., 65.,  0., ...,  0.,  0.,  0.],
       [95., 65.,  0., ...,  0.,  0.,  0.],
       ...,
       [93., 30.,  0., ...,  0.,  1.,  0.],
       [91., 20.,  0., ...,  0.,  1.,  0.],
       [91., 20.,  0., ...,  0.,  1.,  0.]])

In [43]:
# Write the full processed matrix to CSV. Values are written with three
# decimal places; change `fmt` if a different precision is needed.
np.savetxt(
    "data_processed.csv",
    final_data_array,
    fmt='%1.3f',
    delimiter=",",
)

In [44]:
# Split the processed matrix into train and test sets (80% train, 20% test).
from sklearn.model_selection import train_test_split

# train_test_split shuffles the rows before splitting. Without a fixed
# random_state every run yields a different partition, so the train/test
# files saved afterwards could not be regenerated. Pinning the seed makes the
# split reproducible; change the value to draw a different (but still
# repeatable) partition.
train, test = train_test_split(final_data_array, test_size=0.2, random_state=42)

In [45]:
# Write the train and test splits to CSV with three-decimal precision;
# change `fmt` if a different precision is needed.
for out_path, split_rows in (("train.csv", train), ("test.csv", test)):
    np.savetxt(out_path, split_rows, delimiter=",", fmt='%1.3f')