In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import math
from pyspark.ml.feature import VectorAssembler
from sklearn.naive_bayes import MultinomialNB
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("NaiveBayes")\
    .getOrCreate()

In [3]:
training_data = spark.read.csv('../train.csv', header=True, inferSchema=True)
cols_to_drop = ['_c0','id']
training_data = training_data.drop(*cols_to_drop)
training_data = training_data.toPandas()
train = pd.DataFrame(training_data)


test_data = spark.read.csv('../test.csv', header=True, inferSchema=True)
cols_to_drop = ['_c0','id']
test_data = test_data.drop(*cols_to_drop)
test_data = test_data.toPandas()
test = pd.DataFrame(test_data)

In [4]:
numerical_columns = ['Age', 'Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes']
nominal_columns = ['Gender', 'Customer Type','Type of Travel', 'Class','satisfaction']

In [5]:
# drop rows with nulls values
train = train.dropna()

# encode nominal features
train_mappings = {}

for col in nominal_columns:
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])
    mapping = {category: label for category, label in zip(encoder.classes_, encoder.transform(encoder.classes_))}
    train_mappings[col] = mapping

print(train_mappings)


# scaling numerical features
for col in numerical_columns:
#     scaler = StandardScaler()
#     train[col] = scaler.fit_transform(train[[col]])
        train[col] = pd.cut(train[col], bins=10, labels=False)

columns_to_convert = [col for col in train.columns if col != 'satisfaction']

for col in columns_to_convert:
    dummies = pd.get_dummies(train[col], prefix= col)
    dummies = dummies.astype(int)
    train = pd.concat([train, dummies], axis=1)
    train.drop(col, axis=1, inplace=True)


train = train[[col for col in train.columns if col != 'satisfaction'] + ['satisfaction']]


train

{'Gender': {'Female': 0, 'Male': 1}, 'Customer Type': {'Loyal Customer': 0, 'disloyal Customer': 1}, 'Type of Travel': {'Business travel': 0, 'Personal Travel': 1}, 'Class': {'Business': 0, 'Eco': 1, 'Eco Plus': 2}, 'satisfaction': {'neutral or dissatisfied': 0, 'satisfied': 1}}


Unnamed: 0,Gender_0,Gender_1,Customer Type_0,Customer Type_1,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,...,Arrival Delay in Minutes_0,Arrival Delay in Minutes_1,Arrival Delay in Minutes_2,Arrival Delay in Minutes_3,Arrival Delay in Minutes_4,Arrival Delay in Minutes_5,Arrival Delay in Minutes_6,Arrival Delay in Minutes_8,Arrival Delay in Minutes_9,satisfaction
0,0,1,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,1,0,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,1,0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
103900,0,1,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
103901,0,1,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
103902,1,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
# drop rows with nulls values
test = test.dropna()

# encode nominal features
test_mappings = {}

for col in nominal_columns:
    encoder = LabelEncoder()
    test[col] = encoder.fit_transform(test[col])
    mapping = {category: label for category, label in zip(encoder.classes_, encoder.transform(encoder.classes_))}
    test_mappings[col] = mapping

print(test_mappings)


# scaling numerical features
for col in numerical_columns:
    # scaler = StandardScaler()
    # test[col] = scaler.fit_transform(test[[col]])
    test[col] = pd.cut(test[col], bins=10, labels=False)

columns_to_convert = [col for col in test.columns if col != 'satisfaction']

for col in columns_to_convert:
    dummies = pd.get_dummies(test[col], prefix=  col)
    dummies = dummies.astype(int)
    test = pd.concat([test, dummies], axis=1)
    test.drop(col, axis=1, inplace=True)


test = test[[col for col in test.columns if col != 'satisfaction'] + ['satisfaction']]


test

{'Gender': {'Female': 0, 'Male': 1}, 'Customer Type': {'Loyal Customer': 0, 'disloyal Customer': 1}, 'Type of Travel': {'Business travel': 0, 'Personal Travel': 1}, 'Class': {'Business': 0, 'Eco': 1, 'Eco Plus': 2}, 'satisfaction': {'neutral or dissatisfied': 0, 'satisfied': 1}}


Unnamed: 0,Gender_0,Gender_1,Customer Type_0,Customer Type_1,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,...,Arrival Delay in Minutes_1,Arrival Delay in Minutes_2,Arrival Delay in Minutes_3,Arrival Delay in Minutes_4,Arrival Delay in Minutes_5,Arrival Delay in Minutes_6,Arrival Delay in Minutes_7,Arrival Delay in Minutes_8,Arrival Delay in Minutes_9,satisfaction
0,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,0,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
25972,0,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
25973,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25974,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
train, test = train.align(test, join='outer', axis=1, fill_value=0)

In [8]:
train_spark = spark.createDataFrame(train)
test_spark = spark.createDataFrame(test)

In [9]:
FEATURES_COL = train_spark.columns[:-1]
LABEL_COL = train_spark.columns[-1]

In [10]:
y_train_2 = train['satisfaction']
train_2 = train.drop('satisfaction', axis=1)

y_test_2 = test['satisfaction']
test_2 = test.drop('satisfaction', axis=1)


In [16]:
# Initialize the MultinomialNB classifier
clf = MultinomialNB(alpha = 1.0e-10)

# Fit the model to your data
clf.fit(train_2, y_train_2)

y_pred_train = clf.predict(train_2)
accuracy = accuracy_score(y_train_2, y_pred_train)
print("Train Accuracy:", accuracy)

y_pred = clf.predict(test_2)
accuracy = accuracy_score(y_test_2, y_pred)
print("Test Accuracy:", accuracy)

Train Accuracy: 0.8935942236036836
Test Accuracy: 0.8900088827096126


In [12]:
assembler = VectorAssembler(inputCols=FEATURES_COL, outputCol='features')
df_train = assembler.transform(train_spark)

assembler = VectorAssembler(inputCols=FEATURES_COL, outputCol='features')
df_test = assembler.transform(test_spark)

In [13]:
rf = NaiveBayes(labelCol=LABEL_COL, featuresCol="features", modelType="multinomial", smoothing= 0)

model = rf.fit(df_train)

# train
predictions = model.transform(df_train)

evaluator = MulticlassClassificationEvaluator(labelCol="satisfaction", predictionCol="prediction",
                                                metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Train set accuracy = " + str(accuracy))

# test
predictions = model.transform(df_test)

evaluator = MulticlassClassificationEvaluator(labelCol="satisfaction", predictionCol="prediction",
                                                metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Train set accuracy = 0.8935942236036836
Test set accuracy = 0.8900475031861893
