In [1]:
from pyspark.context import  SparkContext
from pyspark.sql.functions import col, lower
from pyspark.sql import SQLContext
import re
sc = SparkContext('local','test1')
sql = SQLContext(sc)

In [2]:
from pyspark.sql.functions import lit
dems_df =  sql.read.text("dems.txt")
gop_df = sql.read.text("gop.txt")

In [3]:
corpus_df = dems_df.select("value", lit(1).alias("label")).union(gop_df.select("value", lit(0).alias("label")))

In [4]:
corpus_df.select("*").limit(20).show()

+--------------------+-----+
|               value|label|
+--------------------+-----+
|This week @senate...|    1|
|Health care profe...|    1|
|RT @SeemaNanda: G...|    1|
|Republicans keep ...|    1|
|RT @SpeakerPelosi...|    1|
|While the preside...|    1|
|You are not alone...|    1|
|RT @DNCWarRoom: W...|    1|
|RT @DNCWarRoom: T...|    1|
|RT @DNCWarRoom: T...|    1|
|LISTEN. TO. HEALT...|    1|
|RT @SeemaNanda: B...|    1|
|This is a HUGE wi...|    1|
|RT @SenSherrodBro...|    1|
|RT @WisDems: Make...|    1|
|RT @DemConvention...|    1|
|Abortion is healt...|    1|
|RT @RepLucyMcBath...|    1|
|Get counted. Get ...|    1|
+--------------------+-----+



In [5]:
from pyspark.sql.functions import udf,lower,col,trim
from pyspark.sql.types import FloatType,StringType,IntegerType
def clean_text(text):
    text=re.sub(r'@[A-Za-z0-9]+','',text).strip() #remove mentions
    text=re.sub(r'#','',text).strip() #removing #symbol
    text=re.sub(r'RT[\s]+','',text).strip()
    text=re.sub(r'[?|$|.|!|;|:|&|"|,|"|"|*|-|(|)]','',text).strip()
    text=re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)+',"",text).strip()
    return text.strip()

In [6]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string).strip()

In [7]:
#value
clean_udf_str=udf(lambda z: clean_text(z), StringType())
corpus_df=corpus_df.select("label",clean_udf_str("value").alias("value"))
emoji_udf_str=udf(lambda z: remove_emoji(z), StringType())
corpus_df=corpus_df.select("label",emoji_udf_str('value').alias('value'))
corpus_df=corpus_df.select(trim(lower(col('value'))).alias("value"),"label")

In [8]:
#After Preprocessing
corpus_df.select("*").limit(20).show()

+--------------------+-----+
|               value|label|
+--------------------+-----+
|this week  said w...|    1|
|health care profe...|    1|
|good to see  sign...|    1|
|republicans keep ...|    1|
|the congress has ...|    1|
|while the preside...|    1|
|you are not alone...|    1|
|well this is conc...|    1|
|trump “in the end...|    1|
|trump proposed hu...|    1|
|listen to health ...|    1|
|breaking we  alon...|    1|
|this is a huge wi...|    1|
|update this is th...|    1|
|make sure your vo...|    1|
|in light of the u...|    1|
|abortion is healt...|    1|
|why does completi...|    1|
|get counted get c...|    1|
+--------------------+-----+



In [9]:
train_df, test_df = corpus_df.randomSplit([0.75, 0.25])

In [34]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, Tokenizer, StopWordsRemover

tokenizer = Tokenizer(inputCol="value", outputCol="words")
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="words_cleaned")
vectorizer = CountVectorizer(inputCol="words_cleaned", outputCol="features")
cleaning_pipeline = Pipeline(stages = [tokenizer,stop_words_remover,vectorizer])
cleaning_pipeline_model = cleaning_pipeline.fit(corpus_df)
cleaned_training_df = cleaning_pipeline_model.transform(train_df)
cleaned_testing_df = cleaning_pipeline_model.transform(test_df)

In [10]:
train_df.show()

+--------------------+-----+
|               value|label|
+--------------------+-----+
|                    |    1|
|'read the transcr...|    1|
|'s actions on dac...|    1|
|'s campaign has s...|    1|
|'s leadership is ...|    1|
|'s role in our na...|    1|
|'we have to not b...|    1|
|- a personal than...|    1|
|-- out of many we...|    1|
|-- the same group...|    1|
|-creating good-pa...|    1|
|1 day of trump he...|    1|
|1 in 10 americans...|    1|
|1 in 3 college st...|    1|
|1 in 3 women expe...|    1|
|1 in 3 women worl...|    1|
|1 in 5 adults in ...|    1|
|1 in 5 lgbtq amer...|    1|
|1 in 5 women and ...|    1|
|1 in 8   that’s h...|    1|
+--------------------+-----+
only showing top 20 rows



In [11]:
cleaned_training_df.show()

+--------------------+-----+--------------------+--------------------+--------------------+
|               value|label|               words|       words_cleaned|            features|
+--------------------+-----+--------------------+--------------------+--------------------+
|                    |    1|                  []|                  []|   (58092,[0],[1.0])|
|                    |    1|                  []|                  []|   (58092,[0],[1.0])|
|'emergency' - an ...|    1|['emergency', -, ...|['emergency', -, ...|(58092,[18,23,46,...|
|'read the transcr...|    1|['read, the, tran...|['read, transcrip...|(58092,[0,2,3,180...|
|'s campaign has s...|    1|['s, campaign, ha...|['s, campaign, st...|(58092,[0,12,24,1...|
|'s campaign says ...|    1|['s, campaign, sa...|['s, campaign, sa...|(58092,[0,119,179...|
|'s leadership is ...|    1|['s, leadership, ...|['s, leadership, ...|(58092,[0,41,81,1...|
|'we have to not b...|    1|['we, have, to, n...|['we, afraid, cen...|(58092,[15

In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Word2Vec, Tokenizer, StopWordsRemover

tokenizer = Tokenizer(inputCol="value", outputCol="words")
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="words_cleaned")
word2vectorizer = Word2Vec(inputCol="words_cleaned", outputCol="features")
cleaning_pipeline = Pipeline(stages = [tokenizer,stop_words_remover,word2vectorizer])
cleaning_pipeline_model = cleaning_pipeline.fit(corpus_df)
cleaned_training_w2v_df = cleaning_pipeline_model.transform(train_df)
cleaned_testing_w2v_df = cleaning_pipeline_model.transform(test_df)

In [13]:
cleaned_training_df.show()

+--------------------+-----+--------------------+--------------------+--------------------+
|               value|label|               words|       words_cleaned|            features|
+--------------------+-----+--------------------+--------------------+--------------------+
|                    |    1|                  []|                  []|   (58092,[0],[1.0])|
|                    |    1|                  []|                  []|   (58092,[0],[1.0])|
|'emergency' - an ...|    1|['emergency', -, ...|['emergency', -, ...|(58092,[18,23,46,...|
|'read the transcr...|    1|['read, the, tran...|['read, transcrip...|(58092,[0,2,3,180...|
|'s campaign has s...|    1|['s, campaign, ha...|['s, campaign, st...|(58092,[0,12,24,1...|
|'s campaign says ...|    1|['s, campaign, sa...|['s, campaign, sa...|(58092,[0,119,179...|
|'s leadership is ...|    1|['s, leadership, ...|['s, leadership, ...|(58092,[0,41,81,1...|
|'we have to not b...|    1|['we, have, to, n...|['we, afraid, cen...|(58092,[15

In [14]:
from pyspark.ml.classification import NaiveBayes,LogisticRegression,OneVsRest
naive_bayes = NaiveBayes(featuresCol="features", labelCol = "label")

In [15]:
naive_bayes_model = naive_bayes.fit(cleaned_training_df)
predictions_df = naive_bayes_model.transform(cleaned_testing_df)

In [16]:
predictions_df.select("features","label","prediction").limit(20).show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(58092,[1,204,247...|    1|       1.0|
|(58092,[0,4,203,2...|    1|       1.0|
|(58092,[0,51,69,7...|    1|       1.0|
|(58092,[242,247,2...|    1|       0.0|
|(58092,[18,109,12...|    1|       0.0|
|(58092,[0,76,85,4...|    1|       0.0|
|(58092,[0,4,49,69...|    1|       0.0|
|(58092,[10,11,12,...|    1|       1.0|
|(58092,[8,47,237,...|    1|       0.0|
|(58092,[0,35,88,1...|    1|       1.0|
|(58092,[4,20,26,2...|    1|       1.0|
|(58092,[0,13,14,4...|    1|       1.0|
|(58092,[1,10,11,3...|    1|       1.0|
|(58092,[2,7,10,11...|    1|       1.0|
|(58092,[534,1586,...|    1|       0.0|
|(58092,[0,75,93,2...|    1|       0.0|
|(58092,[31,38,43,...|    1|       1.0|
|(58092,[0,2,4,31,...|    1|       1.0|
|(58092,[4,5,13,14...|    1|       1.0|
|(58092,[0,9,12,84...|    1|       1.0|
+--------------------+-----+----------+



In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
eval = MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction', metricName = 'accuracy')
eval.evaluate(predictions_df)

0.8806280544868462

In [18]:
cleaned_training_df.select("features").show(4)

+--------------------+
|            features|
+--------------------+
|   (58092,[0],[1.0])|
|   (58092,[0],[1.0])|
|(58092,[18,23,46,...|
|(58092,[0,2,3,180...|
+--------------------+
only showing top 4 rows



In [19]:
log_reg = LogisticRegression(featuresCol="features", labelCol = "label")
ovr=OneVsRest(classifier=log_reg)
log_reg_model = ovr.fit(cleaned_training_w2v_df)
predictions_w2v_df = log_reg_model.transform(cleaned_testing_w2v_df)

In [20]:
predictions_w2v_df.select("features","label","prediction").limit(20).show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[-0.0805224822834...|    1|       1.0|
|[0.04063622681424...|    1|       1.0|
|[0.04199206481687...|    1|       1.0|
|[-0.0104490743357...|    1|       0.0|
|[-0.0202217319145...|    1|       0.0|
|[-0.0077337145128...|    1|       1.0|
|[0.04307432472705...|    1|       0.0|
|[-0.0718544282502...|    1|       1.0|
|[-0.0582980086597...|    1|       0.0|
|[0.02671754105637...|    1|       1.0|
|[0.04417990366928...|    1|       1.0|
|[0.04396477723984...|    1|       1.0|
|[0.08219505460275...|    1|       1.0|
|[0.09237739774248...|    1|       1.0|
|[-0.0298887882381...|    1|       0.0|
|[-0.0670479329419...|    1|       0.0|
|[0.07550884227213...|    1|       1.0|
|[0.13333379157951...|    1|       0.0|
|[0.03884221834165...|    1|       1.0|
|[0.06538315007791...|    1|       1.0|
+--------------------+-----+----------+



In [21]:
eval.evaluate(predictions_w2v_df)

0.7755017157117604

In [35]:
corpus_df=corpus_df.toPandas()
#corpus_df.to_csv('tweets_corpus.csv',ignore_index=True)
def add_prefix(colum):
    return "__label__"+str(colum)
corpus_df["value"]=corpus_df["label"].apply(add_prefix)+" "+corpus_df["value"]


In [70]:
#corpus_df['value'] = corpus_df['value'].apply(lambda x: x.split())

In [36]:
corpus_df

Unnamed: 0,value,label
0,__label__1 this week said workers don’t need ...,1
1,__label__1 health care professionals are on th...,1
2,__label__1 good to see signal a change to its...,1
3,__label__1 republicans keep admitting that vot...,1
4,__label__1 the congress has so far passed thre...,1
...,...,...
38346,__label__0 i voted in favor of the iran accoun...,0
38347,__label__0 congratulations to for winning two...,0
38348,__label__0 “lift the cuban embargo” via http...,0
38349,__label__0 a huge congrats to for making ’s l...,0


In [37]:
corpus_df['value'].to_csv("tweets_corpus.csv",index=False)

  """Entry point for launching an IPython kernel.


In [38]:
import fasttext
model = fasttext.train_supervised("tweets_corpus.csv")

In [39]:
train_df=train_df.toPandas()
test_df=test_df.toPandas()

In [42]:
train_df["value"]=train_df["label"].apply(add_prefix)+" "+train_df["value"]
test_df["value"]=test_df["label"].apply(add_prefix)+" "+test_df["value"]

In [44]:
train_df['value'].to_csv("tweets_train_corpus.csv",index=False)
test_df['value'].to_csv("tweets_test_corpus.csv",index=False)

  """Entry point for launching an IPython kernel.
  


In [45]:
model.labels

['__label__1', '__label__0']

In [46]:
model = fasttext.train_supervised('tweets_train_corpus.csv')

In [47]:
model.labels

['__label__1', '__label__0']

In [48]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p #Precision
    print("R@{}\t{:.3f}".format(1, r)) #Recall

In [49]:
print_results(*model.test('tweets_test_corpus.csv'))

N	9667
P@1	0.874
R@1	0.874
