# Spark MLlib Exercises


http://spark.apache.org/docs/latest/ml-statistics.html

In [3]:
# Tworzymy własne środowisko
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar -xvf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark

import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()
from pyspark.sql import SparkSession

spark-3.1.2-bin-hadoop3.2/
spark-3.1.2-bin-hadoop3.2/R/
spark-3.1.2-bin-hadoop3.2/R/lib/
spark-3.1.2-bin-hadoop3.2/R/lib/sparkr.zip
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/worker/
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/worker/worker.R
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/worker/daemon.R
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/tests/
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/tests/testthat/
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/tests/testthat/test_basic.R
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/profile/
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/profile/shell.R
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/profile/general.R
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/doc/
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/doc/sparkr-vignettes.html
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/doc/sparkr-vignettes.Rmd
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/doc/sparkr-vignettes.R
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/doc/index.html
spark-3.1.2-bin-hadoop3.2/R/lib/SparkR/R/
spark-3.1.2-

In [4]:
# Inicjalizacja
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
spark

## 1. Statistics (1p.)

Download the following dataset: https://www.kaggle.com/c/titanic/data?select=train.csv

In [6]:
file = "titanic_train.csv"
titanic_df = spark.read.format("csv").options(inferSchema="true", header="true").load(file)
titanic_df = titanic_df.dropna(how='any')
titanic_df.show(10)
print(titanic_df.dtypes)

+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|   Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-------+-----------+--------+
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|PC 17599|71.2833|        C85|       C|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|  113803|   53.1|       C123|       S|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|   17463|51.8625|        E46|       S|
|         11|       1|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1| PP 9549|   16.7|         G6|       S|
|         12|       1|     1|Bonnell, Miss. El...|female|58.0|    0|    0|  113783|  26.55|       C103|       S|
|         22|       1|     2|Beesley, Mr. Lawr...|  male|34.0|    0|    0|  248698|   13.0|     

### Exercise 1.A.
**TODO:** Calculate descriptive statistics for 'Age' and 'Fare' (see https://spark.apache.org/docs/1.6.1/api/java/org/apache/spark/sql/DataFrame.html#describe(scala.collection.Seq))

In [8]:
titanic_df.describe("Age", "Fare").show()

+-------+------------------+-----------------+
|summary|               Age|             Fare|
+-------+------------------+-----------------+
|  count|               183|              183|
|   mean|  35.6744262295082|78.68246885245901|
| stddev|15.643865966849717|76.34784270040569|
|    min|              0.92|              0.0|
|    max|              80.0|         512.3292|
+-------+------------------+-----------------+



### Exercise 1.B.

**TODO:** Check if 'Age' and 'Fare' have normal distribution (see http://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/stat/KolmogorovSmirnovTest.html)

In [9]:
from pyspark.ml.stat import KolmogorovSmirnovTest

# We check a standard normal distribution
ksResult_age = KolmogorovSmirnovTest.test(titanic_df, 'Age', 'norm', 0.0, 1.0)
ksResult_fare = KolmogorovSmirnovTest.test(titanic_df, 'Fare', 'norm', 0.0, 1.0)

In [21]:
ksResult_age.first()

Row(pValue=1.943689653671754e-11, statistic=0.9713276975967852)

In [22]:
ksResult_fare.first()

Row(pValue=8.816725127758218e-12, statistic=0.9890707515997943)

In both cases we should reject the null hypothesis, so the variables have a non-normal distribution.

### Exercise 1.C.

**TODO:** Calculate Pearson correlation between the following pairs of features:  
* 'Age' and 'Survived'
* 'Sex' and 'Survived' *(remember about encoding 'Sex' attributes as 0s and 1s)*

Which correlation is stronger?

In [23]:
from pyspark.ml.stat import Correlation

In [26]:
titanic_df.corr("Age", "Survived", method="pearson")

-0.2540847542030532

In [29]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, udf

def str2int(sex: str) -> int:
  return int(sex == "male") # We encode male as 1 and female as 0

convert_sex_to_int = udf(lambda x: str2int(x), IntegerType())

titanic_df_sex_encoded = titanic_df.select("Sex", "Survived").withColumn("Sex", convert_sex_to_int(col("Sex")))
titanic_df_sex_encoded.show()

+---+--------+
|Sex|Survived|
+---+--------+
|  0|       1|
|  0|       1|
|  1|       0|
|  0|       1|
|  0|       1|
|  1|       1|
|  1|       1|
|  1|       0|
|  0|       1|
|  1|       0|
|  1|       0|
|  0|       1|
|  1|       0|
|  0|       1|
|  1|       0|
|  1|       0|
|  1|       1|
|  1|       0|
|  1|       0|
|  1|       0|
+---+--------+
only showing top 20 rows



In [30]:
titanic_df_sex_encoded.corr("Sex", "Survived", method="pearson")

-0.5324179744538412

The second correlation is stronger, because its value is greater with respect to the absolute value.

## 2. Loading data

Doc: http://spark.apache.org/docs/latest/ml-datasource.html 

Download data from https://github.com/apache/spark/blob/master/data/mllib/sample_libsvm_data.txt and load as DataFrame. 

In [34]:
file = "sample_libsvm_data.txt"

df = spark.read.format("libsvm").option("numFeatures", "780").load(file)
df.show(10)
df.take(1)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(780,[127,128,129...|
|  1.0|(780,[158,159,160...|
|  1.0|(780,[124,125,126...|
|  1.0|(780,[152,153,154...|
|  1.0|(780,[151,152,153...|
|  0.0|(780,[129,130,131...|
|  1.0|(780,[158,159,160...|
|  1.0|(780,[99,100,101,...|
|  0.0|(780,[154,155,156...|
|  0.0|(780,[127,128,129...|
+-----+--------------------+
only showing top 10 rows



[Row(label=0.0, features=SparseVector(780, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347: 71.0, 348: 19.0, 349: 28.0

### Exercise 2.A
**TODO:** Load wine data from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/wine.scale
Dataset description: http://archive.ics.uci.edu/ml/datasets/Wine

In [35]:
!wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/wine.scale

--2022-12-29 16:53:30--  https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/wine.scale
Resolving www.csie.ntu.edu.tw (www.csie.ntu.edu.tw)... 140.112.30.26
Connecting to www.csie.ntu.edu.tw (www.csie.ntu.edu.tw)|140.112.30.26|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28116 (27K)
Saving to: ‘wine.scale’


2022-12-29 16:53:30 (5.59 MB/s) - ‘wine.scale’ saved [28116/28116]



In [36]:
file = "wine.scale"
df_wine = spark.read.format("libsvm").option("numFeatures", "13").load(file)
df_wine.show(10)
df_wine.take(1)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(13,[0,1,2,3,4,5,...|
|  1.0|(13,[0,1,2,3,4,5,...|
|  1.0|(13,[0,1,2,3,4,5,...|
|  1.0|(13,[0,1,2,3,4,5,...|
|  1.0|(13,[0,1,2,3,4,5,...|
|  1.0|(13,[0,1,2,3,4,5,...|
|  1.0|(13,[0,1,2,3,4,5,...|
|  1.0|(13,[0,1,2,3,4,5,...|
|  1.0|(13,[0,1,2,3,4,5,...|
|  1.0|(13,[0,1,2,3,4,5,...|
+-----+--------------------+
only showing top 10 rows



[Row(label=1.0, features=SparseVector(13, {0: 0.6842, 1: -0.6166, 2: 0.1444, 3: -0.4845, 4: 0.2391, 5: 0.2552, 6: 0.1477, 7: -0.434, 8: 0.1861, 9: -0.256, 10: -0.0894, 11: 0.9414, 12: 0.1227}))]

## 3. Classification (2p.)

In [37]:
!wget https://gist.githubusercontent.com/tijptjik/9408623/raw/b237fa5848349a14a14e5d4107dc7897c21951f5/wine.csv

--2022-12-29 16:58:02--  https://gist.githubusercontent.com/tijptjik/9408623/raw/b237fa5848349a14a14e5d4107dc7897c21951f5/wine.csv
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10889 (11K) [text/plain]
Saving to: ‘wine.csv’


2022-12-29 16:58:02 (21.6 MB/s) - ‘wine.csv’ saved [10889/10889]



In [38]:
file = "wine.csv" # https://gist.githubusercontent.com/tijptjik/9408623/raw/b237fa5848349a14a14e5d4107dc7897c21951f5/wine.csv

# Remember about deleting dots from the headers of this csv file!
winedf2 = spark.read.format("csv").options(inferSchema="true", header="true").load(file)
winedf2.show(10)
print(winedf2.dtypes)

+----+-------+----------+----+----+---+-------+----------+--------------------+-------+---------+----+----+-------+
|Wine|Alcohol|Malic.acid| Ash| Acl| Mg|Phenols|Flavanoids|Nonflavanoid.phenols|Proanth|Color.int| Hue|  OD|Proline|
+----+-------+----------+----+----+---+-------+----------+--------------------+-------+---------+----+----+-------+
|   1|  14.23|      1.71|2.43|15.6|127|    2.8|      3.06|                0.28|   2.29|     5.64|1.04|3.92|   1065|
|   1|   13.2|      1.78|2.14|11.2|100|   2.65|      2.76|                0.26|   1.28|     4.38|1.05| 3.4|   1050|
|   1|  13.16|      2.36|2.67|18.6|101|    2.8|      3.24|                 0.3|   2.81|     5.68|1.03|3.17|   1185|
|   1|  14.37|      1.95| 2.5|16.8|113|   3.85|      3.49|                0.24|   2.18|      7.8|0.86|3.45|   1480|
|   1|  13.24|      2.59|2.87|21.0|118|    2.8|      2.69|                0.39|   1.82|     4.32|1.04|2.93|    735|
|   1|   14.2|      1.76|2.45|15.2|112|   3.27|      3.39|              

### Exercise 3.A
**TODO:** 

Remember about deleting dots from the headers of this csv file and splitting data into train and test set


1) Create pipeline with VectorAssembler and DecisionTreeClassifier.

2) Use the pipeline to make predictions.

3) Evaluate predictions using MulticlassClassificationEvaluator.

4) Calculate accuracy and test error

5) Print the structure of the trained decision tree (hint: use toDebugString attribute)

Firstly we remove dots from the headers.

In [39]:
winedf2 = winedf2.withColumnRenamed("Malic.acid", "Malic_acid")
winedf2 = winedf2.withColumnRenamed("Nonflavanoid.phenols", "Nonflavanoid_phenols")
winedf2 = winedf2.withColumnRenamed("Color.int", "Color_int")
winedf2.show()

+----+-------+----------+----+----+---+-------+----------+--------------------+-------+---------+----+----+-------+
|Wine|Alcohol|Malic_acid| Ash| Acl| Mg|Phenols|Flavanoids|Nonflavanoid_phenols|Proanth|Color_int| Hue|  OD|Proline|
+----+-------+----------+----+----+---+-------+----------+--------------------+-------+---------+----+----+-------+
|   1|  14.23|      1.71|2.43|15.6|127|    2.8|      3.06|                0.28|   2.29|     5.64|1.04|3.92|   1065|
|   1|   13.2|      1.78|2.14|11.2|100|   2.65|      2.76|                0.26|   1.28|     4.38|1.05| 3.4|   1050|
|   1|  13.16|      2.36|2.67|18.6|101|    2.8|      3.24|                 0.3|   2.81|     5.68|1.03|3.17|   1185|
|   1|  14.37|      1.95| 2.5|16.8|113|   3.85|      3.49|                0.24|   2.18|      7.8|0.86|3.45|   1480|
|   1|  13.24|      2.59|2.87|21.0|118|    2.8|      2.69|                0.39|   1.82|     4.32|1.04|2.93|    735|
|   1|   14.2|      1.76|2.45|15.2|112|   3.27|      3.39|              

Splitting data into train and test set in proportion 70:30 respectively. We use stratified sampling.

In [58]:
wine_train = winedf2.sampleBy("Wine", fractions={1: 0.7, 2: 0.7, 3: 0.7}, seed=10)
wine_test = winedf2.subtract(wine_train)

wine_test.show()

+----+-------+----------+----+----+---+-------+----------+--------------------+-------+---------+----+----+-------+
|Wine|Alcohol|Malic_acid| Ash| Acl| Mg|Phenols|Flavanoids|Nonflavanoid_phenols|Proanth|Color_int| Hue|  OD|Proline|
+----+-------+----------+----+----+---+-------+----------+--------------------+-------+---------+----+----+-------+
|   3|   13.4|      3.91|2.48|23.0|102|    1.8|      0.75|                0.43|   1.41|      7.3| 0.7|1.56|    750|
|   1|   13.2|      1.78|2.14|11.2|100|   2.65|      2.76|                0.26|   1.28|     4.38|1.05| 3.4|   1050|
|   2|  11.46|      3.74|1.82|19.5|107|   3.18|      2.58|                0.24|   3.58|      2.9|0.75|2.81|    562|
|   2|  11.82|      1.72|1.88|19.5| 86|    2.5|      1.64|                0.37|   1.42|     2.06|0.94|2.44|    415|
|   2|  11.87|      4.31|2.39|21.0| 82|   2.86|      3.03|                0.21|   2.91|      2.8|0.75|3.64|    380|
|   1|  14.39|      1.87|2.45|14.6| 96|    2.5|      2.52|              

1) Create pipeline with VectorAssembler and DecisionTreeClassifier.

In [59]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

num_classes = wine_train.select("Wine").distinct().count()
feature_cols = wine_train.columns[1:]

vec_assembler = VectorAssembler(inputCols = feature_cols, outputCol="features")
decision_tree = DecisionTreeClassifier(labelCol="Wine", featuresCol="features")

pipeline = Pipeline(stages=[vec_assembler, decision_tree])

2) Use the pipeline to make predictions.

In [60]:
model = pipeline.fit(wine_train)
predictions = model.transform(wine_test)

3) Evaluate predictions using MulticlassClassificationEvaluator.

In [61]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="Wine", predictionCol="prediction", metricName="accuracy")

4) Calculate accuracy and test error.

In [62]:
y_true = predictions.select(['Wine']).collect()
y_pred = predictions.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           1       0.75      0.83      0.79        18
           2       0.86      0.76      0.81        25
           3       0.92      1.00      0.96        12

    accuracy                           0.84        55
   macro avg       0.85      0.86      0.85        55
weighted avg       0.84      0.84      0.84        55



5) Print the structure of the trained decision tree (hint: use toDebugString attribute).

In [63]:
tree_model = model.stages[1]
print(tree_model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_01df802709c3, depth=4, numNodes=15, numClasses=4, numFeatures=13
  If (feature 9 <= 3.9450000000000003)
   If (feature 11 <= 3.7)
    If (feature 6 <= 0.495)
     Predict: 3.0
    Else (feature 6 > 0.495)
     If (feature 2 <= 2.8499999999999996)
      Predict: 2.0
     Else (feature 2 > 2.8499999999999996)
      Predict: 1.0
   Else (feature 11 > 3.7)
    Predict: 1.0
  Else (feature 9 > 3.9450000000000003)
   If (feature 6 <= 2.075)
    If (feature 2 <= 2.0700000000000003)
     Predict: 2.0
    Else (feature 2 > 2.0700000000000003)
     Predict: 3.0
   Else (feature 6 > 2.075)
    If (feature 0 <= 11.585)
     Predict: 2.0
    Else (feature 0 > 11.585)
     Predict: 1.0



### Exercise 3.B
**TODO:** 

1) Extend the pipeline from the previos task with QuantileDiscretizer 

2) Try using a couple of different numbers of buckets, which cinfiguration gives the best results?

3) Can you see any difference in the structure of the decistion tree?

In [66]:
from pyspark.ml.feature import QuantileDiscretizer

def fit_decision_tree2(train_df, test_df, num_buckets: int, label_col="Wine"):

    num_classes = train_df.select(label_col).distinct().count()
    feature_cols = train_df.columns[1:]
    discretized_cols = [f"{col}_disc" for col in train_df.columns[1:]]
    
    discretizer = QuantileDiscretizer(
        inputCols=feature_cols,
        outputCols=discretized_cols,
        numBuckets=num_buckets
    )
    vec_assembler = VectorAssembler(
        inputCols=discretized_cols, 
        outputCol="features"
    )
    decision_tree = DecisionTreeClassifier(
        labelCol=label_col, 
        featuresCol="features",
        
    )
    pipeline = Pipeline(stages=[discretizer, vec_assembler, decision_tree]) 

    model = pipeline.fit(train_df)
    predictions = model.transform(test_df)

    evaluator = MulticlassClassificationEvaluator(
        labelCol="Wine", 
        predictionCol="prediction", 
        metricName="accuracy"
    )
    accuracy = evaluator.evaluate(predictions) * 100
    
    return model, accuracy

In [67]:
for num_bins in range(2, 6):
    print("Bins:", num_bins)
    model, accuracy = fit_decision_tree2(wine_train, wine_test, num_bins)
    print(f"Accuracy: {accuracy:.2f}")

    tree_model = model.stages[2]
    print(tree_model.toDebugString)
    print()

Bins: 2
Accuracy: 85.45
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_4fa7567ac73b, depth=5, numNodes=25, numClasses=4, numFeatures=13
  If (feature 0 in {0.0})
   If (feature 9 in {0.0})
    If (feature 10 in {0.0})
     If (feature 11 in {0.0})
      If (feature 7 in {0.0})
       Predict: 3.0
      Else (feature 7 not in {0.0})
       Predict: 2.0
     Else (feature 11 not in {0.0})
      Predict: 2.0
    Else (feature 10 not in {0.0})
     If (feature 1 in {0.0})
      Predict: 2.0
     Else (feature 1 not in {0.0})
      If (feature 12 in {0.0})
       Predict: 2.0
      Else (feature 12 not in {0.0})
       Predict: 1.0
   Else (feature 9 not in {0.0})
    If (feature 6 in {0.0})
     If (feature 10 in {0.0})
      Predict: 3.0
     Else (feature 10 not in {0.0})
      Predict: 2.0
    Else (feature 6 not in {0.0})
     Predict: 2.0
  Else (feature 0 not in {0.0})
   If (feature 6 in {0.0})
    If (feature 1 in {0.0})
     If (feature 2 in {0.0})
      Predict: 2.0


The configuration with three bins has the best accuracy.

In the structure of the decision trees with have bins instead of specific values on a level of value distribution.

## 4. Text classification (2p.)

### Exercise 4
**TODO:** 
Build a pipeline consisting of Tokenizer, HashingTF, IDF and StringIndexer and LogisticRegression, fit it to training data: 
http://help.sentiment140.com/for-students/

What is the accuracy of this classifier?

In [68]:
!wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

--2022-12-29 21:19:15--  http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip [following]
--2022-12-29 21:19:15--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘trainingandtestdata.zip’


2022-12-29 21:19:20 (17.2 MB/s) - ‘trainingandtestdata.zip’ saved [81363704/81363704]



In [72]:
!unzip /content/trainingandtestdata.zip -d /content/sentiment140

Archive:  /content/trainingandtestdata.zip
  inflating: /content/sentiment140/testdata.manual.2009.06.14.csv  
  inflating: /content/sentiment140/training.1600000.processed.noemoticon.csv  


We also rename these files on the left side of the Google Colab panel.

In [78]:
file = './sentiment140/train.csv'
train_sentiment_df = spark.read.format("csv").options(inferSchema="true", header="false").load(file)

file = './sentiment140/test.csv'
test_sentiment_df = spark.read.format("csv").options(inferSchema="true", header="false").load(file)

In [79]:
train_sentiment_df.show()

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|  0|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|  0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|  0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|  0|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nop

In [80]:
test_sentiment_df.show()

+---+---+--------------------+-------+--------------+--------------------+
|_c0|_c1|                 _c2|    _c3|           _c4|                 _c5|
+---+---+--------------------+-------+--------------+--------------------+
|  4|  3|Mon May 11 03:17:...|kindle2|        tpryan|@stellargirl I lo...|
|  4|  4|Mon May 11 03:18:...|kindle2|        vcu451|Reading my kindle...|
|  4|  5|Mon May 11 03:18:...|kindle2|        chadfu|Ok, first assesme...|
|  4|  6|Mon May 11 03:19:...|kindle2|         SIX15|@kenburbary You'l...|
|  4|  7|Mon May 11 03:21:...|kindle2|      yamarama|@mikefish  Fair e...|
|  4|  8|Mon May 11 03:22:...|kindle2|  GeorgeVHulme|@richardebaker no...|
|  0|  9|Mon May 11 03:22:...|    aig|       Seth937|Fuck this economy...|
|  4| 10|Mon May 11 03:26:...| jquery|     dcostalis|Jquery is my new ...|
|  4| 11|Mon May 11 03:27:...|twitter|       PJ_King|       Loves twitter|
|  4| 12|Mon May 11 03:29:...|  obama|   mandanicole|how can you not l...|
|  2| 13|Mon May 11 03:32

In [81]:
columns = ["label", "id", "date", "query", "user", "text"]

for old_col_name, new_col_name in zip(train_sentiment_df.columns, columns):
  train_sentiment_df = train_sentiment_df.withColumnRenamed(old_col_name, new_col_name)
  test_sentiment_df = test_sentiment_df.withColumnRenamed(old_col_name, new_col_name)

train_sentiment_df.show(5)

+-----+----------+--------------------+--------+---------------+--------------------+
|label|        id|                date|   query|           user|                text|
+-----+----------+--------------------+--------+---------------+--------------------+
|    0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|    0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|    0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|    0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|    0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+-----+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



We should check labels.

In [85]:
train_sentiment_df.select("label").distinct().show()

+-----+
|label|
+-----+
|    4|
|    0|
+-----+



In [86]:
test_sentiment_df.select("label").distinct().show()

+-----+
|label|
+-----+
|    4|
|    2|
|    0|
+-----+



We remove observation classified as 2 from the test dataset, since there is no class 2 in the train dataset.

In [87]:
test_sentiment_df = test_sentiment_df.where(test_sentiment_df.label != 2)
test_sentiment_df.select("label").distinct().show()

+-----+
|label|
+-----+
|    4|
|    0|
+-----+



In [89]:
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, Tokenizer
from pyspark.ml.classification import LogisticRegression

tokenizer = Tokenizer(
    inputCol="text", 
    outputCol="tokens"
)
hashing_tf = HashingTF(
    inputCol="tokens", 
    outputCol="features", 
    numFeatures=50
)
idf = IDF(
    inputCol="features", 
    outputCol="final_features"
)
string_indexer = StringIndexer(
    inputCol="label", 
    outputCol="final_label"
)
classifier = LogisticRegression(
    featuresCol="final_features", 
    labelCol="final_label", 
    predictionCol="prediction"
)

pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, classifier])

model = pipeline.fit(train_sentiment_df)
predictions = model.transform(test_sentiment_df)

evaluator = MulticlassClassificationEvaluator(
    labelCol="final_label", 
    predictionCol="prediction", 
    metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions) * 100

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 53.20
