In [30]:
# !pip install pyspark

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

In [32]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("RandomForest")\
    .getOrCreate()

In [33]:
train = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv("./train.csv")



In [34]:
train

DataFrame[_c0: int, id: int, Gender: string, Customer Type: string, Age: int, Type of Travel: string, Class: string, Flight Distance: int, Inflight wifi service: int, Departure/Arrival time convenient: int, Ease of Online booking: int, Gate location: int, Food and drink: int, Online boarding: int, Seat comfort: int, Inflight entertainment: int, On-board service: int, Leg room service: int, Baggage handling: int, Checkin service: int, Inflight service: int, Cleanliness: int, Departure Delay in Minutes: int, Arrival Delay in Minutes: double, satisfaction: string]

In [35]:

cols_to_drop = ['_c0','id']
train = train.drop(*cols_to_drop)
train = train.toPandas()
df = pd.DataFrame(train)

# Apply LabelEncoder to each column to convert string to num
X = LabelEncoder()
X = df.apply(X.fit_transform)

# Print the transformed DataFrame
X

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,0,6,1,2,358,3,4,3,1,...,5,4,3,3,4,5,5,25,18,0
1,1,1,18,0,0,144,3,2,3,3,...,1,1,5,2,1,4,1,1,6,0
2,0,0,19,0,0,994,2,2,2,2,...,5,4,3,3,4,4,5,0,0,1
3,0,0,18,0,0,446,2,5,5,5,...,2,2,5,2,1,4,2,11,9,0
4,1,0,54,0,0,124,3,3,3,3,...,3,3,4,3,3,3,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,0,1,16,0,1,102,2,1,2,3,...,2,3,1,3,2,3,2,3,0,0
103900,1,0,42,0,0,2143,4,4,4,4,...,5,5,5,4,5,5,4,0,0,1
103901,1,1,23,0,0,1791,1,1,1,3,...,4,3,2,3,5,5,4,7,14,0
103902,0,1,15,0,1,860,1,1,1,5,...,1,4,5,0,5,4,1,0,0,0


In [36]:
X = spark.createDataFrame(X)
LABEL_COL = "satisfaction"
FEATURES_COL = list(set(X.columns) - set([LABEL_COL]))
print("Label used: ", LABEL_COL)
print("Features used: ", FEATURES_COL)

Label used:  satisfaction
Features used:  ['Type of Travel', 'Departure/Arrival time convenient', 'Arrival Delay in Minutes', 'Leg room service', 'Customer Type', 'Checkin service', 'Inflight wifi service', 'On-board service', 'Seat comfort', 'Age', 'Online boarding', 'Gate location', 'Baggage handling', 'Flight Distance', 'Food and drink', 'Gender', 'Inflight entertainment', 'Cleanliness', 'Departure Delay in Minutes', 'Ease of Online booking', 'Inflight service', 'Class']


In [37]:
assembler = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df = assembler.transform(X)
df

DataFrame[Gender: bigint, Customer Type: bigint, Age: bigint, Type of Travel: bigint, Class: bigint, Flight Distance: bigint, Inflight wifi service: bigint, Departure/Arrival time convenient: bigint, Ease of Online booking: bigint, Gate location: bigint, Food and drink: bigint, Online boarding: bigint, Seat comfort: bigint, Inflight entertainment: bigint, On-board service: bigint, Leg room service: bigint, Baggage handling: bigint, Checkin service: bigint, Inflight service: bigint, Cleanliness: bigint, Departure Delay in Minutes: bigint, Arrival Delay in Minutes: bigint, satisfaction: bigint, features: vector]

In [38]:
#create rf model
rf = RandomForestClassifier(labelCol=LABEL_COL, featuresCol="features", numTrees=80, maxDepth=30, featureSubsetStrategy="log2", seed=42)
#fit the model
model = rf.fit(df)

# Apply the model to generate predictions
predictions = model.transform(df)

# Select the "satisfaction" and "prediction" columns
selected_columns = predictions.select("satisfaction", "prediction")

# Convert the DataFrame to an RDD
rdd = selected_columns.rdd

# Map the RDD to calculate 1.0 for correct predictions and 0.0 for incorrect predictions
accuracy_values = rdd.map(lambda x: 1.0 if x[0] == x[1] else 0.0)

# Calculate the mean value of the accuracy values
accuracy = accuracy_values.mean()

# Print the training accuracy
print("Training Accuracy:", accuracy)

Training Accuracy: 0.9997401447489971


In [39]:
df_val = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv("test.csv")

In [40]:

cols_to_drop = ['_c0','id']
df_val = df_val.drop(*cols_to_drop)

df_val = df_val.toPandas()
df = pd.DataFrame(df_val)

# Apply LabelEncoder to each column
X = LabelEncoder()
X = df.apply(X.fit_transform)

# Print the transformed DataFrame
X
X = spark.createDataFrame(X)
assembler = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df_val = assembler.transform(X)

In [41]:
# Apply the model to generate predictions on the validation dataset
predictions_val = model.transform(df_val)

# Select the "satisfaction" and "prediction" columns
selected_columns_val = predictions_val.select("satisfaction", "prediction")

# Convert the DataFrame to an RDD
rdd_val = selected_columns_val.rdd

# Map the RDD to calculate 1.0 for correct predictions and 0.0 for incorrect predictions
accuracy_values_val = rdd_val.map(lambda x: 1.0 if x[0] == x[1] else 0.0)

# Calculate the mean value of the accuracy values
accuracy_val = accuracy_values_val.mean()

# Print the validation accuracy
print("Validation Accuracy:", accuracy_val)

Validation Accuracy: 0.9488758854327071
