# This notebook uses PySpark to build a Logistic Regression model

In [36]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook, creating it if none is active yet.
spark = (
    SparkSession.builder
    .appName('logRe')
    .getOrCreate()
)

In [2]:
# Read the CSV into a Spark DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
df = spark.read.csv(
    'Log_Reg_dataset.csv',
    header=True,
    inferSchema=True,
)


In [3]:
# Dataset shape, printed as a (row count, column count) tuple
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

(20000, 6)


In [4]:
# Print the schema to confirm the inferred data type of every column,
# in case any of them needs to be changed before modelling.
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)



In [5]:
# List the column names of the DataFrame
df.columns

['Country', 'Age', 'Repeat_Visitor', 'Platform', 'Web_pages_viewed', 'Status']

In [6]:
# Preview the first five rows of the DataFrame
df.show(n=5)

+---------+---+--------------+--------+----------------+------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|
+---------+---+--------------+--------+----------------+------+
|    India| 41|             1|   Yahoo|              21|     1|
|   Brazil| 28|             1|   Yahoo|               5|     0|
|   Brazil| 40|             0|  Google|               3|     0|
|Indonesia| 31|             1|    Bing|              15|     1|
| Malaysia| 32|             0|  Google|              15|     1|
+---------+---+--------------+--------+----------------+------+
only showing top 5 rows



In [7]:
# Summary statistics (count, mean, stddev, min, max) for every column
summary_df = df.describe()
summary_df.show()

+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|summary| Country|              Age|   Repeat_Visitor|Platform| Web_pages_viewed|            Status|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|  count|   20000|            20000|            20000|   20000|            20000|             20000|
|   mean|    null|         28.53955|           0.5029|    null|           9.5533|               0.5|
| stddev|    null|7.888912950773227|0.500004090187782|    null|6.073903499824976|0.5000125004687693|
|    min|  Brazil|               17|                0|    Bing|                1|                 0|
|    max|Malaysia|              111|                1|   Yahoo|               29|                 1|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+



In [8]:
# Explore the categorical columns: first the number of rows per category,
# then the per-Platform average of every numeric column.
for category_column in ('Country', 'Platform'):
    df.groupBy(category_column).count().show()

platform_means = df.groupBy('Platform').mean()
platform_means.show()

+---------+-----+
|  Country|count|
+---------+-----+
| Malaysia| 1218|
|    India| 4018|
|Indonesia|12178|
|   Brazil| 2586|
+---------+-----+

+--------+-----+
|Platform|count|
+--------+-----+
|   Yahoo| 9859|
|    Bing| 4360|
|  Google| 5781|
+--------+-----+

+--------+------------------+-------------------+---------------------+------------------+
|Platform|          avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|       avg(Status)|
+--------+------------------+-------------------+---------------------+------------------+
|   Yahoo|28.569226087838523| 0.5094837204584644|    9.599655137437875|0.5071508266558474|
|    Bing| 28.68394495412844| 0.4720183486238532|    9.114908256880733|0.4559633027522936|
|  Google|28.380038055699707| 0.5149628092025601|    9.804878048780488|0.5210171250648676|
+--------+------------------+-------------------+---------------------+------------------+



In [9]:
#Feature Engineering
from pyspark.ml.feature import StringIndexer # For converting the categorical variable into numerical
from pyspark.ml.feature import VectorAssembler #For combining all the input features

# Since we are dealing with two categorical columns, we will have to convert the Country and Platform columns into numerical form

## The first step is to label the column using StringIndexer into numerical form. It allocates unique values to each of the categories of the column.

In [10]:

# Learn the Platform -> numeric index mapping from the data, then append it
# to the DataFrame as the Platform_Num column.
platform_indexer_stage = StringIndexer(inputCol="Platform", outputCol="Platform_Num")
search_engine_indexer = platform_indexer_stage.fit(df)
df = search_engine_indexer.transform(df)

In [11]:
# First three rows, untruncated, now including the Platform_Num column
df.show(n=3, truncate=False)

+-------+---+--------------+--------+----------------+------+------------+
|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|Platform_Num|
+-------+---+--------------+--------+----------------+------+------------+
|India  |41 |1             |Yahoo   |21              |1     |0.0         |
|Brazil |28 |1             |Yahoo   |5               |0     |0.0         |
|Brazil |40 |0             |Google  |3               |0     |1.0         |
+-------+---+--------------+--------+----------------+------+------------+
only showing top 3 rows



In [12]:
# Compare the category frequencies of the original Platform column with
# those of its numeric index (Platform_Num), most frequent first.
for counted_column in ('Platform', 'Platform_Num'):
    frequency_df = df.groupBy(counted_column).count()
    frequency_df.orderBy('count', ascending=False).show(5, False)

+--------+-----+
|Platform|count|
+--------+-----+
|Yahoo   |9859 |
|Google  |5781 |
|Bing    |4360 |
+--------+-----+

+------------+-----+
|Platform_Num|count|
+------------+-----+
|0.0         |9859 |
|1.0         |5781 |
|2.0         |4360 |
+------------+-----+



# The next step is to represent each of these values in the form of a one-hot encoded vector.

In [13]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the numeric platform index into a sparse vector column.
search_engine_encoder = OneHotEncoder(inputCol="Platform_Num", outputCol="Platform_Vector")

# In Spark 3.x OneHotEncoder is an Estimator: it has to be fitted, and only the
# fitted model exposes transform(). In Spark 2.x it is a plain Transformer with
# no fit(). Fitting only when fit() exists keeps this cell working on both.
if hasattr(search_engine_encoder, "fit"):
    platform_encoding_stage = search_engine_encoder.fit(df)
else:
    platform_encoding_stage = search_engine_encoder
df = platform_encoding_stage.transform(df)

#   Let’s repeat the same procedure for the other categorical column(Country)

In [14]:
# Step 1: map each Country string to a numeric index (Country_Num).
country_indexer = StringIndexer(inputCol="Country", outputCol="Country_Num").fit(df)
df = country_indexer.transform(df)

# Step 2: one-hot encode that index into a sparse vector (Country_Vector).
country_encoder = OneHotEncoder(inputCol="Country_Num", outputCol="Country_Vector")

# In Spark 3.x OneHotEncoder is an Estimator: it has to be fitted, and only the
# fitted model exposes transform(). In Spark 2.x it is a plain Transformer with
# no fit(). Fitting only when fit() exists keeps this cell working on both.
if hasattr(country_encoder, "fit"):
    country_encoding_stage = country_encoder.fit(df)
else:
    country_encoding_stage = country_encoder
df = country_encoding_stage.transform(df)

## Now that we have converted both the categorical columns into numerical forms, we need to assemble all of the input columns into a single vector that would act as the input feature for the model.

In [15]:
# Input columns that are concatenated, in this order, into the single
# 'features' vector consumed by the model.
feature_columns = [
    'Platform_Vector',
    'Country_Vector',
    'Age',
    'Repeat_Visitor',
    'Web_pages_viewed',
]
df_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = df_assembler.transform(df)

In [16]:
# Inspect the assembled feature vector next to the label, untruncated
features_and_label = df.select('features', 'Status')
features_and_label.show(n=10, truncate=False)

+-----------------------------------+------+
|features                           |Status|
+-----------------------------------+------+
|[1.0,0.0,0.0,1.0,0.0,41.0,1.0,21.0]|1     |
|[1.0,0.0,0.0,0.0,1.0,28.0,1.0,5.0] |0     |
|(8,[1,4,5,7],[1.0,1.0,40.0,3.0])   |0     |
|(8,[2,5,6,7],[1.0,31.0,1.0,15.0])  |1     |
|(8,[1,5,7],[1.0,32.0,15.0])        |1     |
|(8,[1,4,5,7],[1.0,1.0,32.0,3.0])   |0     |
|(8,[1,4,5,7],[1.0,1.0,32.0,6.0])   |0     |
|(8,[1,2,5,7],[1.0,1.0,27.0,9.0])   |0     |
|(8,[0,2,5,7],[1.0,1.0,32.0,2.0])   |0     |
|(8,[2,5,6,7],[1.0,31.0,1.0,16.0])  |1     |
+-----------------------------------+------+
only showing top 10 rows



### Let us select only features column as input and the Status column as output for training the logistic regression model.

In [17]:
# Keep only the model input (features) and the label (Status)
model_df = df.select('features', 'Status')

### Splitting the Dataset

In [18]:
# Split the data roughly 75% / 25% into training and test sets.
# A fixed seed is passed so that the split -- and therefore every metric
# computed further down -- does not change from one run to the next
# (given the same input data and partitioning). Without a seed each run
# draws a different split, so recorded outputs from an unseeded run will
# not match a re-run exactly.
SPLIT_SEED = 42
training_df, test_df = model_df.randomSplit([0.75, 0.25], seed=SPLIT_SEED)
print(training_df.count())
print(test_df.count())

15070
4930


### Build and Train Logistic Regression Model

In [19]:
from pyspark.ml.classification import LogisticRegression

# Configure the estimator with 'Status' as the label column, then fit it
# on the training split; log_reg is the fitted model.
log_reg_estimator = LogisticRegression(labelCol='Status')
log_reg = log_reg_estimator.fit(training_df)

# Training Results

In [20]:
# Evaluate the model on the training data and keep the per-row predictions
# so the predicted labels can be compared with the actual ones.
training_summary = log_reg.evaluate(training_df)
train_results = training_summary.predictions

In [21]:
# Peek at the first five scored training rows
train_results.show(n=5)

+--------------------+------+--------------------+--------------------+----------+
|            features|Status|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(8,[0,2,5,7],[1.0...|     0|[5.93130796852951...|[0.99735202334563...|       0.0|
|(8,[0,2,5,7],[1.0...|     0|[5.93130796852951...|[0.99735202334563...|       0.0|
|(8,[0,2,5,7],[1.0...|     0|[5.93130796852951...|[0.99735202334563...|       0.0|
|(8,[0,2,5,7],[1.0...|     0|[5.93130796852951...|[0.99735202334563...|       0.0|
|(8,[0,2,5,7],[1.0...|     0|[5.93130796852951...|[0.99735202334563...|       0.0|
+--------------------+------+--------------------+--------------------+----------+
only showing top 5 rows



In [22]:
# Rows whose actual label is 1 and which the model also predicted as 1,
# showing only the 'Status', 'prediction' and 'probability' columns.
actual_is_positive = train_results['Status'] == 1
predicted_is_positive = train_results['prediction'] == 1
correctly_predicted_positives = (
    train_results
    .filter(actual_is_positive)
    .filter(predicted_is_positive)
)
correctly_predicted_positives.select(['Status', 'prediction', 'probability']).show(10, False)

+------+----------+-----------------------------------------+
|Status|prediction|probability                              |
+------+----------+-----------------------------------------+
|1     |1.0       |[0.3085128352044721,0.691487164795528]   |
|1     |1.0       |[0.3085128352044721,0.691487164795528]   |
|1     |1.0       |[0.1742514347904196,0.8257485652095804]  |
|1     |1.0       |[0.09075088014880064,0.9092491198511994] |
|1     |1.0       |[0.09075088014880064,0.9092491198511994] |
|1     |1.0       |[0.09075088014880064,0.9092491198511994] |
|1     |1.0       |[0.09075088014880064,0.9092491198511994] |
|1     |1.0       |[0.09075088014880064,0.9092491198511994] |
|1     |1.0       |[0.045079054372010874,0.9549209456279892]|
|1     |1.0       |[0.045079054372010874,0.9549209456279892]|
+------+----------+-----------------------------------------+
only showing top 10 rows



# Evaluate Logistic Regression Model on Test Data

In [23]:
# Score the held-out test split and keep the per-row predictions
test_summary = log_reg.evaluate(test_df)
results = test_summary.predictions

In [24]:
# Actual label next to predicted label for the first ten test rows
label_vs_prediction = results.select('Status', 'prediction')
label_vs_prediction.show(n=10, truncate=False)

+------+----------+
|Status|prediction|
+------+----------+
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|1     |0.0       |
|0     |0.0       |
+------+----------+
only showing top 10 rows



# Build Confusion Matrix

#### We will manually create the variables for true positives, true negatives, false positives, and false negatives to understand them better rather than using the direct inbuilt function.

In [29]:
# Boolean column expressions for the actual and the predicted class
actual_pos = results['Status'] == 1
actual_neg = results['Status'] == 0
predicted_pos = results['prediction'] == 1
predicted_neg = results['prediction'] == 0

# The four cells of the confusion matrix, each counted on the test predictions
tp = results.filter(actual_pos & predicted_pos).count()  # true positives
tn = results.filter(actual_neg & predicted_neg).count()  # true negatives
fp = results.filter(actual_neg & predicted_pos).count()  # false positives
fn = results.filter(actual_pos & predicted_neg).count()  # false negatives

### Accuracy

In [33]:
# Accuracy = correctly classified rows / all scored test rows
correct_predictions = tp + tn
total_predictions = results.count()
accuracy = float(correct_predictions / total_predictions)
print("Accurcy=", accuracy)

Accurcy= 0.9356997971602434


### Recall 

In [34]:
# Recall = true positives / all rows whose actual label is positive
actual_positive_total = tp + fn
recall = float(tp / actual_positive_total)
print("Recall=", recall)

Recall= 0.9313287281592848


### Precision

In [35]:
# Precision = true positives / all rows the model predicted as positive
predicted_positive_total = tp + fp
Precision = float(tp / predicted_positive_total)
print("Precision=", Precision)

Precision= 0.9393442622950819
