In [1]:
import pyspark as ps 

sc = ps.sql.SparkSession.builder.master("local").appName("firstmodel").getOrCreate()

In [2]:
df= sc.read.csv('RegressionData_1.csv',header=True,inferSchema=True)
df

DataFrame[year_target: int, Uid: int, age: int, Pid: int, city: string, state: string, subscribe: int, MARRIAGE: int, EDUCATION: int]

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [4]:
stringindexerInputs = ['city','state']
stringindexerOutputs = ['index_city','index_state']
stringindexer = StringIndexer(inputCols=stringindexerInputs, outputCols=stringindexerOutputs)
#ds=stringindexer.fit(df).transform(df)
#ds.show(5)

In [5]:
assemblerInputs = ['index_city','index_state','year_target','Uid','age','Pid','MARRIAGE','EDUCATION']
vector_assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
#ml_dataset=vector_assembler.transform(ds)
#ml_dataset.show(5)

In [6]:
(trainDataset, testDataset) = df.randomSplit([.7,0.3])

In [7]:
rf_model = RandomForestClassifier(labelCol="subscribe",featuresCol="features",numTrees=20)

In [8]:
pipeline = Pipeline(stages=[stringindexer, vector_assembler, rf_model])

In [9]:
model = pipeline.fit(trainDataset)

In [10]:
predictions=model.transform(testDataset)

In [11]:
predictions.show(5)

+-----------+----+---+------+---------+-----+---------+--------+---------+----------+-----------+--------------------+--------------------+--------------------+----------+
|year_target| Uid|age|   Pid|     city|state|subscribe|MARRIAGE|EDUCATION|index_city|index_state|            features|       rawPrediction|         probability|prediction|
+-----------+----+---+------+---------+-----+---------+--------+---------+----------+-----------+--------------------+--------------------+--------------------+----------+
|       1920| 143| 77|988389|bangalore|   HP|        1|       1|        2|       2.0|        1.0|[2.0,1.0,1920.0,1...|[10.8751007314163...|[0.54375503657081...|       0.0|
|       1920| 173| 66|537792|bangalore|   MP|        0|       2|        1|       2.0|        2.0|[2.0,2.0,1920.0,1...|[10.0080433324573...|[0.50040216662286...|       0.0|
|       1920|1005| 30|638631|    delhi|   HP|        0|       1|        1|       3.0|        1.0|[3.0,1.0,1920.0,1...|[10.0769640279059...|[

In [12]:
evaluator = MulticlassClassificationEvaluator(labelCol="subscribe",predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy_dtc :",accuracy)

Accuracy_dtc : 0.4947691676345243


In [14]:
selected = predictions.select("Uid", "features", "probability", "prediction")
selected.show(5)

+----+--------------------+--------------------+----------+
| Uid|            features|         probability|prediction|
+----+--------------------+--------------------+----------+
| 143|[0.0,1.0,1920.0,1...|[0.47248640126779...|       1.0|
|1407|[2.0,0.0,1920.0,1...|[0.48845394608192...|       1.0|
|1853|[3.0,0.0,1920.0,1...|[0.48379077108386...|       1.0|
|2407|[3.0,1.0,1920.0,2...|[0.44821529468136...|       1.0|
|2859|[1.0,2.0,1920.0,2...|[0.48962694534595...|       1.0|
+----+--------------------+--------------------+----------+
only showing top 5 rows

