# Logistic Regression in Apache Spark (PySpark)

## Introduction:

## Motivation:

## Design:

### Step 1: Loading libraries

In [12]:

from pyspark.sql import SQLContext  # used below to load the CSV file as a DataFrame
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is the
# SparkContext injected by the PySpark shell/kernel -- confirm before running elsewhere.
sqlContext = SQLContext(sc)
# Tuner based on a train/validation split.
# NOTE(review): not used in the cells shown; the actual split is done with DataFrame.randomSplit.
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.classification import LogisticRegression  # model builder (estimator)
from pyspark.sql import SparkSession  # to create the Spark session
from pyspark.ml import Pipeline  # chains the pre-processing steps applied to the data
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler  # data conversion stages
from pyspark.ml.evaluation import BinaryClassificationEvaluator  # model evaluator





### Step 2: Initializing the Spark session

In [13]:
if __name__ == "__main__":
    # Get the active Spark session if one exists, otherwise create one named "LR".
    spark = (
        SparkSession.builder
        .appName("LR")
        .getOrCreate()
    )

### Step 3: Loading data and pre-processing 

In [14]:

# Location of the HR data set.
# NOTE(review): this path is machine-specific -- point it at your own copy of the file.
DATA_PATH = '/home/meaww/Downloads/HR_comma_sep.csv'

# Read the file with Spark's built-in CSV reader (no external spark-csv package
# needed): the first line is treated as the header and column types are inferred.
df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)
cols = df.columns  # column names of the raw data

In [15]:
df.head(5)  # View the first 5 rows to get some idea about the data

[Row(satisfaction_level=0.38, last_evaluation=0.53, number_project=2, average_montly_hours=157, time_spend_company=3, Work_accident=0, left=1, promotion_last_5years=0, sales=u'sales', salary=u'low'),
 Row(satisfaction_level=0.8, last_evaluation=0.86, number_project=5, average_montly_hours=262, time_spend_company=6, Work_accident=0, left=1, promotion_last_5years=0, sales=u'sales', salary=u'medium'),
 Row(satisfaction_level=0.11, last_evaluation=0.88, number_project=7, average_montly_hours=272, time_spend_company=4, Work_accident=0, left=1, promotion_last_5years=0, sales=u'sales', salary=u'medium'),
 Row(satisfaction_level=0.72, last_evaluation=0.87, number_project=5, average_montly_hours=223, time_spend_company=5, Work_accident=0, left=1, promotion_last_5years=0, sales=u'sales', salary=u'low'),
 Row(satisfaction_level=0.37, last_evaluation=0.52, number_project=2, average_montly_hours=159, time_spend_company=3, Work_accident=0, left=1, promotion_last_5years=0, sales=u'sales', salary=u'low')]

In [16]:
cols  # View the column names of the data

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years',
 'sales',
 'salary']

In [17]:
# Categorical (string-valued) columns; they must be encoded numerically
# before they can be used as model features.
catcols=["sales","salary"]

In [18]:
# Build the list of pre-processing stages that the pipeline will run in order.
stages = []

# Each categorical column is first mapped to a numeric index and that index is
# then one-hot encoded into a "<column>classVec" vector column.
for c in catcols:
    strIndexer = StringIndexer(inputCol=c, outputCol=c + "Index")
    encoder = OneHotEncoder(inputCol=c + "Index", outputCol=c + "classVec")
    stages += [strIndexer, encoder]

# Turn the target column "left" into the numeric "label" column.
# NOTE(review): StringIndexer assigns indices by value frequency by default, so
# which class becomes 0.0 depends on the class balance of the data -- confirm
# that label 1.0 really corresponds to left == 1.
label_idx = StringIndexer(inputCol="left", outputCol="label")
stages += [label_idx]

# Columns that are already numeric and can be used as features directly.
numcols = ["satisfaction_level", "last_evaluation", "number_project",
           "average_montly_hours", "time_spend_company",
           "Work_accident", "promotion_last_5years"]

# A list comprehension (instead of map(...) + list) works on both Python 2 and
# Python 3; on Python 3 map() returns an iterator that cannot be added to a list.
assem_ip = [c + "classVec" for c in catcols] + numcols

# Combine the encoded categorical vectors and the numeric columns into "features".
assembler = VectorAssembler(inputCols=assem_ip, outputCol="features")
stages += [assembler]

In [19]:
# Run every pre-processing stage through a single pipeline.
pipeline = Pipeline(stages=stages)
pl_ml = pipeline.fit(df)

# Keep the model inputs first, followed by the original columns.
sel_cols = ["label", "features"] + cols

# NOTE: `df` is rebound to the transformed data here, so this cell is not meant
# to be re-run without re-loading the raw data first.
df = pl_ml.transform(df).select(sel_cols)

In [20]:
df.head()  # View the first row of the transformed data

Row(label=1.0, features=SparseVector(18, {0: 1.0, 9: 1.0, 11: 0.38, 12: 0.53, 13: 2.0, 14: 157.0, 15: 3.0}), satisfaction_level=0.38, last_evaluation=0.53, number_project=2, average_montly_hours=157, time_spend_company=3, Work_accident=0, left=1, promotion_last_5years=0, sales=u'sales', salary=u'low')

### Step 4: Splitting into train and test and model building process

In [21]:
# Randomly split the rows roughly 75% / 25% into train and test sets (fixed seed).
train, test = df.randomSplit([0.75, 0.25], seed=141)
# Configure a logistic regression estimator (at most 100 iterations, regParam 0.3).
lr = LogisticRegression(maxIter=100, regParam=0.3)
# Fit the model on the train data
lrModel = lr.fit(train)

### Step 5: Model evaluation process

In [30]:
# Score the held-out test rows with the fitted model.
predictions = lrModel.transform(test)
# No metricName is passed, so the evaluator's default metric is used
# (areaUnderROC according to the PySpark documentation).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
# Last expression of the cell: the metric value is displayed as the cell output.
evaluator.evaluate(predictions)

0.8052813342286241




In [33]:
# Show the fitted parameters of the logistic regression model:
# one coefficient per entry of the feature vector, plus the intercept.
print("Coefficients: %s" % (lrModel.coefficients,))
print("Intercept: %s" % (lrModel.intercept,))


Coefficients: [0.0241288152874,0.0311345364118,0.0515342912541,-0.0259221737696,0.000719485546598,-0.0299886908688,-0.194978042699,0.0599593637189,0.125851517991,0.204817567347,-0.0394367734893,-1.32491947181,0.0144600802804,-0.0167936562558,0.00108419081115,0.0783239547735,-0.386446828376,-0.369147881284]
Intercept: -0.880250209332


### Challenges Faced:

## Conclusion: