In [40]:
!pip install pyspark



In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

In [42]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("RandomForest")\
    .getOrCreate()

In [43]:
train = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv("./train.csv")



In [44]:
train

DataFrame[_c0: int, id: int, Gender: string, Customer Type: string, Age: int, Type of Travel: string, Class: string, Flight Distance: int, Inflight wifi service: int, Departure/Arrival time convenient: int, Ease of Online booking: int, Gate location: int, Food and drink: int, Online boarding: int, Seat comfort: int, Inflight entertainment: int, On-board service: int, Leg room service: int, Baggage handling: int, Checkin service: int, Inflight service: int, Cleanliness: int, Departure Delay in Minutes: int, Arrival Delay in Minutes: double, satisfaction: string]

In [45]:

# train.drop(columns=['Unnamed: 0','id'],inplace = True)
cols_to_drop = ['Unnamed','id']
train = train.drop(*cols_to_drop)
# test.drop(columns=['Unnamed: 0','id'],inplace = True)
# train = train.drop('satisfaction', axis=1)
train = train.toPandas()
df = pd.DataFrame(train)

# Apply LabelEncoder to each column
X = LabelEncoder()
X = df.apply(X.fit_transform)

# Print the transformed DataFrame
X

Unnamed: 0,_c0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,1,0,6,1,2,358,3,4,3,...,5,4,3,3,4,5,5,25,18,0
1,1,1,1,18,0,0,144,3,2,3,...,1,1,5,2,1,4,1,1,6,0
2,2,0,0,19,0,0,994,2,2,2,...,5,4,3,3,4,4,5,0,0,1
3,3,0,0,18,0,0,446,2,5,5,...,2,2,5,2,1,4,2,11,9,0
4,4,1,0,54,0,0,124,3,3,3,...,3,3,4,3,3,3,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,103899,0,1,16,0,1,102,2,1,2,...,2,3,1,3,2,3,2,3,0,0
103900,103900,1,0,42,0,0,2143,4,4,4,...,5,5,5,4,5,5,4,0,0,1
103901,103901,1,1,23,0,0,1791,1,1,1,...,4,3,2,3,5,5,4,7,14,0
103902,103902,0,1,15,0,1,860,1,1,1,...,1,4,5,0,5,4,1,0,0,0


In [46]:
X = spark.createDataFrame(X)
LABEL_COL = "satisfaction"
FEATURES_COL = list(set(X.columns) - set([LABEL_COL]))
print("Label used: ", LABEL_COL)
print("Features used: ", FEATURES_COL)

Label used:  satisfaction
Features used:  ['Gate location', 'Seat comfort', 'Arrival Delay in Minutes', 'Age', 'Baggage handling', 'Food and drink', 'Inflight wifi service', 'Inflight service', 'Inflight entertainment', 'Customer Type', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gender', '_c0', 'Cleanliness', 'Class', 'Online boarding', 'Flight Distance', 'Departure Delay in Minutes', 'Type of Travel', 'Leg room service', 'Checkin service', 'On-board service']


In [47]:
assembler = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df = assembler.transform(X)

In [48]:
rf = RandomForestClassifier(labelCol=LABEL_COL, featuresCol="features", numTrees=50, maxDepth=15, featureSubsetStrategy="log2", seed=42)
model = rf.fit(df)
# Training accuracy
accuracy = model.transform(df).select("satisfaction", "prediction").rdd.map(lambda x: 1.0 if x[0] == x[1] else 0.0).mean()
print("Training Accuracy: ", accuracy)

Training Accuracy:  0.9764782876501391


In [51]:
df_val = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv("test.csv")

In [52]:

cols_to_drop = ['Unnamed','id']
df_val = df_val.drop(*cols_to_drop)
# test.drop(columns=['Unnamed: 0','id'],inplace = True)
# train = train.drop('satisfaction', axis=1)
df_val = df_val.toPandas()
df = pd.DataFrame(df_val)

# Apply LabelEncoder to each column
X = LabelEncoder()
X = df.apply(X.fit_transform)

# Print the transformed DataFrame
X
X = spark.createDataFrame(X)
assembler = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df_val = assembler.transform(X)

In [53]:
accuracy = model.transform(df_val).select("satisfaction", "prediction").rdd.map(lambda x: 1.0 if x[0] == x[1] else 0.0).mean()
print("Validation Accuracy: ", accuracy)

Validation Accuracy:  0.9470280258700339
