In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import math
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Create (or reuse, if one is already running) a Spark session on the local
# machine; "local[*]" asks Spark for one worker thread per logical core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("RandomForest")
    .getOrCreate()
)

In [3]:
# Directory holding train.csv / test.csv. The recorded run of this cell failed
# with PATH_NOT_FOUND for this location, so point it at wherever the files
# actually live before running.
DATA_DIR = '/content'

# Columns removed right after loading ('_c0' is the name Spark gives to an
# unnamed CSV column).
cols_to_drop = ['_c0', 'id']


def load_csv_as_pandas(path, drop_cols):
    """Read a CSV file through Spark and return it as a pandas DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file; the first row is used as the header and
        column types are inferred by Spark.
    drop_cols : list of str
        Column names to remove before converting to pandas.

    Returns
    -------
    pandas.DataFrame
        The file contents without ``drop_cols``.

    Raises
    ------
    pyspark.errors.AnalysisException
        Raised by Spark when ``path`` does not exist.
    """
    spark_frame = spark.read.csv(path, header=True, inferSchema=True)
    return spark_frame.drop(*drop_cols).toPandas()


# `training_data` / `test_data` keep the frames exactly as loaded; `train` /
# `test` are independent copies that the preprocessing cells are free to modify.
training_data = load_csv_as_pandas(f'{DATA_DIR}/train.csv', cols_to_drop)
train = training_data.copy()

test_data = load_csv_as_pandas(f'{DATA_DIR}/test.csv', cols_to_drop)
test = test_data.copy()

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/train.csv.

In [None]:
# Columns treated as numeric: each one is standardized during preprocessing.
numerical_columns = [
    'Age',
    'Flight Distance',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
]

# Columns treated as categorical: each one is label-encoded during
# preprocessing. 'satisfaction' is listed here too, so it is encoded the same way.
nominal_columns = [
    'Gender',
    'Customer Type',
    'Type of Travel',
    'Class',
    'satisfaction',
]

In [None]:
# Remove every row that contains at least one missing value.
train = train.dropna()

# Replace each nominal column with integer codes and remember, per column,
# which category was assigned which code.
train_mappings = {}

for col in nominal_columns:
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])
    # Pair every learned category with the code the encoder assigns to it.
    train_mappings[col] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

print(train_mappings)


# Standardize the numerical columns one at a time, each with its own scaler
# fitted on that column only.
for col in numerical_columns:
    scaler = StandardScaler()
    single_column = train[[col]]  # 2-D selection, as the scaler expects
    train[col] = scaler.fit_transform(single_column)

In [None]:
# Show the training frame as it stands after the preprocessing cell above
# (rows with nulls dropped, nominal columns encoded, numerical columns scaled).
train

In [None]:
# Drop rows with null values.
test = test.dropna()

# Encode nominal features with the category -> code mapping learned on the
# TRAINING split (`train_mappings`). Fitting a fresh encoder on the test split
# would assign codes from the test categories alone, so the same category
# could receive a different integer than the model saw during training
# whenever the two splits do not contain exactly the same categories.
test_mappings = {}

for col in nominal_columns:
    mapping = train_mappings[col]
    observed = set(test[col].unique())

    # A category the training split never contained has no code; fail loudly
    # instead of silently producing NaN. This also triggers if the cell is
    # re-run on an already-encoded frame -- reload `test` first in that case.
    unseen = observed - set(mapping)
    if unseen:
        raise ValueError(
            f"Column {col!r} contains categories not present in the training data: "
            f"{sorted(unseen, key=str)}"
        )

    test[col] = test[col].map(mapping).astype('int64')
    # Record only the categories that actually occur in the test split.
    test_mappings[col] = {category: label for category, label in mapping.items() if category in observed}

print(test_mappings)


# Scaling numerical features.
# NOTE(review): each scaler below is fitted on the test split itself, so the
# test features are standardized with the test split's own mean/std rather
# than the training split's. Reusing the scalers fitted on the training data
# would be the consistent choice, but the training cell does not keep them --
# revisit both cells together.
for col in numerical_columns:
    scaler = StandardScaler()
    test[col] = scaler.fit_transform(test[[col]])



In [None]:
# Show the test frame as it stands after the preprocessing cell above
# (rows with nulls dropped, nominal columns encoded, numerical columns scaled).
test

In [None]:
# Give both frames the same set of columns: the union of their column labels
# is kept (outer join along axis 1), and a column missing from one frame is
# created there and filled with 0.
train, test = train.align(
    test,
    axis=1,
    join='outer',
    fill_value=0,
)

In [None]:
# Hand the preprocessed pandas frames back to Spark (train first, then test).
train_spark, test_spark = (spark.createDataFrame(frame) for frame in (train, test))

In [None]:
# Select the label by NAME instead of by position. The frames were column-
# aligned in an earlier cell, which does not guarantee that the label stays in
# the last position; picking `columns[-1]` could then train on the wrong
# target and leave the real label among the features.
LABEL_COL = 'satisfaction'

if LABEL_COL not in train_spark.columns:
    raise ValueError(f"Label column {LABEL_COL!r} not found in the training data")

# Every remaining column is a model input, in its existing order.
FEATURES_COL = [name for name in train_spark.columns if name != LABEL_COL]

In [None]:
# Combine the feature columns into a single vector column named 'features'.
# The same assembler configuration is applied to the train and the test frame.
assembler = VectorAssembler(inputCols=FEATURES_COL, outputCol='features')
df_train, df_test = (assembler.transform(frame) for frame in (train_spark, test_spark))

In [None]:
# Fit a random forest on the assembled training features. The seed is fixed so
# repeated runs build the same forest.
rf = RandomForestClassifier(
    labelCol=LABEL_COL,
    featuresCol="features",
    numTrees=80,
    maxDepth=30,
    featureSubsetStrategy="log2",
    seed=42,
)
model = rf.fit(df_train)

# One evaluator serves both splits. It is keyed on LABEL_COL so it always
# scores against the same column the classifier was trained on, rather than
# repeating the column name as a separate literal.
evaluator = MulticlassClassificationEvaluator(
    labelCol=LABEL_COL,
    predictionCol="prediction",
    metricName="accuracy",
)

# train
predictions = model.transform(df_train)
accuracy = evaluator.evaluate(predictions)
print("Train set accuracy = " + str(accuracy))

# test
# NOTE(review): the recorded run shows ~0.9999 on train vs ~0.963 on test; the
# gap may indicate overfitting from the very deep trees -- worth checking.
predictions = model.transform(df_test)
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Train set accuracy = 0.999874510106763
Test set accuracy = 0.9634264086818831
