### Vectorization

Let's vectorize the processed data and store the fitted vectorizer in a pickle file. As next steps, we will apply various ML models and cross-validate them.

In [48]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [4]:
# Read the processed dataset into a DataFrame (path is relative to the working directory).
processed = pd.read_csv(filepath_or_buffer='../data/processed/processed.csv')

In [57]:
# Replace every num_code value with its string form, then print the frame
# summary to confirm the resulting column dtypes.
processed['num_code'] = processed['num_code'].map(str)
processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300261 entries, 0 to 300260
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   airline         300261 non-null  object
 1   ch_code         300261 non-null  object
 2   num_code        300261 non-null  object
 3   from            300261 non-null  object
 4   time_taken      300261 non-null  int64 
 5   stop            300261 non-null  object
 6   to              300261 non-null  object
 7   price           300261 non-null  int64 
 8   type            300261 non-null  object
 9   days_left       300261 non-null  int64 
 10  dep_time_phase  300261 non-null  object
 11  arr_time_phase  300261 non-null  object
dtypes: int64(3), object(9)
memory usage: 27.5+ MB


In [58]:
# Columns that get one-hot encoded. 'num_code' is not listed in either group,
# and ColumnTransformer drops unlisted columns by default (remainder='drop'),
# so it does not appear in the vectorized output.
categorical_features = ['airline', 'ch_code', 'from', 'stop', 'to', 'type', 'dep_time_phase', 'arr_time_phase']
# NOTE(review): 'price' looks like the prediction target; because it is listed
# here, the saved vectorizer will require a 'price' column at transform time
# and the vectorized matrix contains it. Confirm this is intended.
numerical_features = ['time_taken', 'days_left', 'price']

# handle_unknown="ignore": a category that was not seen during fit is encoded
# as an all-zero row for that feature instead of raising, so the pickled
# vectorizer keeps working on new data. The fit-time output is unchanged.
cat_pipeline = Pipeline(steps = [("ohe", OneHotEncoder(handle_unknown="ignore"))])
# Numerical columns pass through unchanged unless values are missing, in which
# case the most frequent value of the column is filled in.
num_pipeline = Pipeline(steps = [("imputer", SimpleImputer(strategy="most_frequent"))])

# Apply each pipeline to its own column group and stack the results side by side.
vectorizer = ColumnTransformer([("cat_piplines", cat_pipeline, categorical_features), ("num_pipeline", num_pipeline, numerical_features)])

vectorized_data = vectorizer.fit_transform(processed)

print('Data is vectorized')

print('Shape of data after vectorization: {0}'.format(vectorized_data.shape))

Data is vectorized
Shape of data after vectorization: (300261, 46)


In [59]:
# Persist the fitted vectorizer so the exact same encoding can be reloaded later.
# Pickle files can execute code when loaded: only unpickle this file from a trusted source.
vectorizer_file = "../models/vectorizer.pkl"
with open(vectorizer_file, mode='wb') as f:
    f.write(pickle.dumps(vectorizer))
print('Dumped the vectorizer in {} file'.format(vectorizer_file))

Dumped the vectorizer in ../models/vectorizer.pkl file


In [68]:
# Export the vectorized matrix as a plain CSV next to the processed data.
processed_data_folder = "../data/processed"
vectorized_data_path = os.path.join(processed_data_folder, "vectorized_data.csv")

# ColumnTransformer returns a SciPy sparse matrix only when the density of the
# stacked output is below its sparse_threshold; otherwise the result is already
# a dense ndarray, which has no .toarray(). Handle both cases.
if hasattr(vectorized_data, "toarray"):
    vectorized_data_arr = vectorized_data.toarray()
else:
    vectorized_data_arr = np.asarray(vectorized_data)

# fmt='%s' writes each value using its str() form, one comma-separated row per sample.
np.savetxt(vectorized_data_path, vectorized_data_arr, fmt='%s', delimiter=',')
print('Vectorized data exported.')
print('List of files under ../data/processed: {0}'.format(os.listdir(processed_data_folder)))

Vectorized data exported.
List of files under ../data/processed: ['.gitkeep', 'processed.csv', 'vectorized_data.csv']
