In [1]:
import numpy as np

import findspark
findspark.init()

In [2]:
#1 - import module
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy
import pandas

In [3]:
# GCS Config
GCP_PROJECT = 'NS01-Project'
MODEL_BUCKET = 'gs://twitter_testtt'
VERSION_NAME = 'v1'
MODEL_NAME = 'xgmodel'

In [4]:
#3 - Setup SparkSession (SparkSQL)
spark = (SparkSession
         .builder
         .appName("DataFrameHandOn")
         .master("local[*]")
         .getOrCreate())
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f5337f73a00>


In [5]:
df = spark.read.csv("gs://twitter_testtt/labeled_data.csv", header=True, inferSchema=True)
df.cache()
print("finish caching data")

[Stage 1:>                                                          (0 + 1) / 1]

finish caching data


                                                                                

In [6]:
df.show(5)

[Stage 2:>                                                          (0 + 1) / 1]

+---+-----+-----------+------------------+-------+-----+--------------------+
|_c0|count|hate_speech|offensive_language|neither|class|               tweet|
+---+-----+-----------+------------------+-------+-----+--------------------+
|  0|    3|          0|                 0|      3|    2|!!! RT @mayasolov...|
|  1|    3|          0|                 3|      0|    1|!!!!! RT @mleew17...|
|  2|    3|          0|                 3|      0|    1|!!!!!!! RT @UrKin...|
|  3|    3|          0|                 2|      1|    1|!!!!!!!!! RT @C_G...|
|  4|    6|          0|                 6|      0|    1|!!!!!!!!!!!!! RT ...|
+---+-----+-----------+------------------+-------+-----+--------------------+
only showing top 5 rows



                                                                                

In [7]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- count: string (nullable = true)
 |-- hate_speech: string (nullable = true)
 |-- offensive_language: string (nullable = true)
 |-- neither: integer (nullable = true)
 |-- class: integer (nullable = true)
 |-- tweet: string (nullable = true)



In [8]:
pd_df = df.toPandas()

In [9]:
pd_df.head(30)

Unnamed: 0,_c0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3.0,0.0,0.0,3.0,2.0,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3.0,0.0,3.0,0.0,1.0,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3.0,0.0,3.0,0.0,1.0,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3.0,0.0,2.0,1.0,1.0,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6.0,0.0,6.0,0.0,1.0,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,5,3.0,1.0,2.0,0.0,1.0,"""!!!!!!!!!!!!!!!!!!""""@T_Madison_x: The shit ju..."
6,6,3.0,0.0,3.0,0.0,1.0,"""!!!!!!""""@__BrighterDays: I can not just sit u..."
7,7,3.0,0.0,3.0,0.0,1.0,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,8,3.0,0.0,3.0,0.0,1.0,""""""" &amp; you might not get ya bitch back &amp..."
9,9,3.0,1.0,2.0,0.0,1.0,""""""" @rhythmixx_ :hobbies include: fighting Mar..."


In [10]:
df = df.dropna()

In [11]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

def onlyTwoClass(x):
    return 1 if str(x)>str(1) else 0

my_udf = udf(onlyTwoClass, IntegerType())

In [12]:
new_df = df.withColumn('class', my_udf('class'))

In [13]:
new_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- count: string (nullable = true)
 |-- hate_speech: string (nullable = true)
 |-- offensive_language: string (nullable = true)
 |-- neither: integer (nullable = true)
 |-- class: integer (nullable = true)
 |-- tweet: string (nullable = true)



In [14]:
new_df.show(10)

[Stage 4:>                                                          (0 + 1) / 1]

+---+-----+-----------+------------------+-------+-----+--------------------+
|_c0|count|hate_speech|offensive_language|neither|class|               tweet|
+---+-----+-----------+------------------+-------+-----+--------------------+
|  0|    3|          0|                 0|      3|    1|!!! RT @mayasolov...|
|  1|    3|          0|                 3|      0|    0|!!!!! RT @mleew17...|
|  2|    3|          0|                 3|      0|    0|!!!!!!! RT @UrKin...|
|  3|    3|          0|                 2|      1|    0|!!!!!!!!! RT @C_G...|
|  4|    6|          0|                 6|      0|    0|!!!!!!!!!!!!! RT ...|
|  5|    3|          1|                 2|      0|    0|"!!!!!!!!!!!!!!!!...|
|  6|    3|          0|                 3|      0|    0|"!!!!!!""@__Brigh...|
|  7|    3|          0|                 3|      0|    0|!!!!&#8220;@selfi...|
|  8|    3|          0|                 3|      0|    0|""" &amp; you mig...|
|  9|    3|          1|                 2|      0|    0|""" @rhy

                                                                                

In [15]:
# Drop unused column
new_df = new_df.drop("count", "hate_speech", "offensive_language", "neither")

In [16]:
new_df.show(10)

+---+-----+--------------------+
|_c0|class|               tweet|
+---+-----+--------------------+
|  0|    1|!!! RT @mayasolov...|
|  1|    0|!!!!! RT @mleew17...|
|  2|    0|!!!!!!! RT @UrKin...|
|  3|    0|!!!!!!!!! RT @C_G...|
|  4|    0|!!!!!!!!!!!!! RT ...|
|  5|    0|"!!!!!!!!!!!!!!!!...|
|  6|    0|"!!!!!!""@__Brigh...|
|  7|    0|!!!!&#8220;@selfi...|
|  8|    0|""" &amp; you mig...|
|  9|    0|""" @rhythmixx_ :...|
+---+-----+--------------------+
only showing top 10 rows



In [17]:
new_df.groupBy("class").count().show()

[Stage 6:>                                                          (0 + 1) / 1]

+-----+-----+
|class|count|
+-----+-----+
|    1| 4163|
|    0|20620|
+-----+-----+



                                                                                

In [18]:
# DATA Cleaning

In [19]:
import re

def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', str(text_string))
    parsed_text = re.sub(giant_url_regex, '', str(parsed_text))
    parsed_text = re.sub(mention_regex, '', str(parsed_text))
    parsed_text = re.sub("[^a-zA-Z:,]+", ' ', str(parsed_text))
    parsed_text = parsed_text.replace('RT', '')
    parsed_text = parsed_text.replace('!', '')
    parsed_text = parsed_text.replace(':', '')
    parsed_text = parsed_text.strip('\'"')
    parsed_text = parsed_text.lower()
    parsed_text = parsed_text.lstrip()
    
    return parsed_text

In [20]:
txt_process_udf = udf(preprocess, StringType())
new_df = new_df.withColumn('tweet', txt_process_udf('tweet'))

In [21]:
new_df = new_df.dropna()

In [22]:
new_df.show(20)

+---+-----+--------------------+
|_c0|class|               tweet|
+---+-----+--------------------+
|  0|    1|as a woman you sh...|
|  1|    0|boy dats cold tyg...|
|  2|    0|dawg   you ever f...|
|  3|    0|she look like a t...|
|  4|    0|the shit you hear...|
|  5|    0|the shit just blo...|
|  6|    0|i can not just si...|
|  7|    0|cause i m tired o...|
|  8|    0|amp you might not...|
|  9|    0|hobbies include f...|
| 10|    0|keeks is a bitch ...|
| 11|    0|murda gang bitch ...|
| 12|    0|so hoes that smok...|
| 13|    0|bad bitches is th...|
| 14|    0|bitch get up off me |
| 15|    0|bitch nigga miss ...|
| 16|    0| bitch plz whatever |
| 17|    0|bitch who do you ...|
| 18|    0|bitches get cut o...|
| 19|    0|black bottle amp ...|
+---+-----+--------------------+
only showing top 20 rows



In [23]:
train_df , test_df = new_df.randomSplit([0.8, 0.2])

## Text Featurization

In [24]:
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.ml import Pipeline

In [25]:
tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="words", outputCol="Features")

#Create Pipeline
w2v_pipeline = Pipeline(stages=[tokenizer, w2v])

w2v_pipeline_model = w2v_pipeline.fit(train_df)
train_df = w2v_pipeline_model.transform(train_df)
test_df = w2v_pipeline_model.transform(test_df)

21/12/05 21:18:50 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/12/05 21:18:51 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [26]:
train_df.show(10)

[Stage 17:>                                                         (0 + 1) / 1]

+-----+-----+--------------------+--------------------+--------------------+
|  _c0|class|               tweet|               words|            Features|
+-----+-----+--------------------+--------------------+--------------------+
|    0|    1|as a woman you sh...|[as, a, woman, yo...|[2.21605230446742...|
|    1|    0|boy dats cold tyg...|[boy, dats, cold,...|[-0.0159893891707...|
|  100|    0|how bout them cow...|[how, bout, them,...|[-0.0075616843532...|
| 1000|    1|mike calls me t b...|[mike, calls, me,...|[-0.0012457987293...|
|10000|    0|he needs too we w...|[he, needs, too, ...|[-0.0080277135923...|
|10002|    0|he only favorites...|[he, only, favori...|[-0.0225900625093...|
|10003|    0|he prolly gone la...|[he, prolly, gone...|[-0.0201731702416...|
|10004|    0|he pussy whipped ...|[he, pussy, whipp...|[-0.0163165788762...|
|10005|    0|he run his mouth ...|[he, run, his, mo...|[-0.0348054950092...|
|10006|    0|  he said bitch boy |[he, said, bitch,...|[-0.0331203057139...|

                                                                                

In [28]:
w2v.save("gs://twitter_testtt/w2v_model1")

                                                                                

In [29]:
#w2v_test = Word2Vec.load("gs://twitter_testtt/w2v_model")

### Training function

In [30]:
test_df.groupby('class').count().toPandas()

                                                                                

Unnamed: 0,class,count
0,1,830
1,0,4095


In [31]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score

In [37]:
train_features = train_df.select("Features").collect()
train_lables = train_df.select("class").collect()
test_features = test_df.select("Features").collect()
test_labels = test_df.select("class").collect()


X_train = np.asarray([v[0].toArray() for v in train_features])
Y_train = np.asarray([v[0] for v in train_lables])
X_test =  np.asarray([v[0].toArray() for v in test_features])
Y_test = np.asarray([v[0] for v in test_labels])

xgbClassifier = xgb.XGBClassifier(max_depth=20, seed=18238, objective='multi:softmax',num_class = 2)
model = xgbClassifier.fit(X_train, Y_train)
pred = model.predict(X_test)

auc_score = accuracy_score(Y_test,pred)
print ("The accuracy score for XGboost model : ",auc_score)



The accuracy score for XGboost model :  0.8976649746192893


# SAVE MODEL PARAMS

In [38]:
model.save_model("model1.bst")

In [39]:
!gsutil cp ./model.bst $MODEL_BUCKET

Copying file://./model.bst [Content-Type=application/octet-stream]...
- [1 files][  1.5 MiB/  1.5 MiB]                                                
Operation completed over 1 objects/1.5 MiB.                                      
