In [2]:
import pyspark

In [46]:
import re
from nltk.stem import PorterStemmer

In [54]:
from pyspark.ml.feature import *

In [43]:
f = open('stopwords.txt','r')
stop = f.read().split(',')

In [51]:
global stopwords
stopwords = {}
for word in stop:
    stopwords[word] = True

In [3]:
sc = pyspark.SparkContext.getOrCreate()

# load text files from multiple folders as an rdd

In [106]:
text_file = sc.textFile("data/Sports,data/Politics,data/Science,data/Business")

In [68]:
def stopWords(word):
    stemmer = PorterStemmer()
    try:
        if stopwords[word]:
            return False #this word should be abandoned when return true
    except KeyError:
        return True

In [92]:
def splitwords(line):
    words = line.split(" ")
    returnwords = []
    stemmer = PorterStemmer()
    for word in words:
        if(re.match('http.*',word)):
                continue
        word = ''.join(re.findall(r'[0-9a-zA-Z]*', word))
        if(len(word)>10):
            continue
        if(word.isdigit()):
            continue
        word = word.lower()
        word = stemmer.stem(word)
        returnwords.append(word)
    return returnwords

In [47]:
def mapper(word):
    return (word,1)

## compute the word count for all articals and extract the top 1000 words as features

In [107]:
counts = text_file.flatMap(splitwords).filter(stopWords).map(mapper).reduceByKey(lambda a, b: a + b).\
takeOrdered(1000, key=lambda x: -x[1])

In [154]:
global feature
feature = {}
for i in range(len(counts)):
    feature[counts[i][0]] = i

# read files into pandas dataframe

In [129]:
import os
import pandas as pd
import numpy as np

In [127]:
label_list = os.listdir("data")

In [135]:
def textdf(list_, folder,label):
    t = []
    for i in range(len(list_)):
        filename = "data/"+folder + "/" + list_[i]
        f = open(filename,"r",encoding = "utf-8")
        t.append(f.read())
    df = pd.DataFrame({"content":t, "label":[label]*len(t)})
    return df

In [132]:
label_list  

['Politics', 'Sports', 'Science', 'Business']

### get labels from the folder names

In [136]:
label = label_list[0]
file_list = os.listdir(f"data/{label}")
Politics_df = textdf(file_list,label,0)

In [137]:
label = label_list[1]
file_list = os.listdir(f"data/{label}")
Sports_df = textdf(file_list,label,1)

label = label_list[2]
file_list = os.listdir(f"data/{label}")
Science_df = textdf(file_list,label,2)

label = label_list[3]
file_list = os.listdir(f"data/{label}")
Business_df = textdf(file_list,label,3)

### randomly seperate data into train set and test set

In [138]:
resultdf = Science_df.append([Business_df, Politics_df, Sports_df],ignore_index=True)

In [142]:
resultdf = resultdf.reindex(np.random.permutation(resultdf.index))

In [171]:
twenty = round(len(resultdf)/5)
train_set = resultdf[0:(twenty*3)]
test_set = resultdf[(twenty*3):]

## transform pandas dataframe into spark dataframe

In [147]:
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)

In [213]:
df_train = sqlCtx.createDataFrame(train_set)

In [212]:
df_test = sqlCtx.createDataFrame(test_set)

In [177]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

## for words in one artical, once a word in feature appears, the feature vector +=1

In [163]:
def feature_ab(line):
    f = [0]*1000
    text = line[0]
    label = line[1]
    stemmer = PorterStemmer()
    words = text.lower().split(" ")
    for i in words:
        i = stemmer.stem(i)
        try:
            num = feature[i]
            f[num] +=1 #or just = 1
        except KeyError:
            continue
    return LabeledPoint(label,f)

In [214]:
train_data = df_train.rdd.map(feature_ab)

# Logistic regression

In [178]:
lrm = LogisticRegressionWithLBFGS.train(train_data, iterations=100, numClasses=4)

In [181]:
labelsAndPreds = train_data.map(lambda p: (p.label, lrm.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(train_data.count())
print("trainning Accuracy = " + str((1-trainErr)*100)+"%")

trainning Accuracy = 98.38308457711443%


In [218]:
test_data = df_test.rdd.map(feature_ab)

In [187]:
labelsAndPreds_test = test_data.map(lambda p: (p.label, lrm.predict(p.features)))
testErr = labelsAndPreds_test.filter(lambda lp: lp[0] != lp[1]).count() / float(test_data.count())
print("testing Accuracy = " + str((1-testErr)*100)+"%")

testing Accuracy = 94.83830845771143%


# Naive Bayes

In [210]:
from pyspark.mllib.classification import NaiveBayes

In [215]:
model = NaiveBayes.train(train_data, 1.0)

In [216]:
labelsAndPreds_bayes = train_data.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds_bayes.filter(lambda lp: lp[0] != lp[1]).count() / float(train_data.count())
print("trainning Accuracy = " + str((1-trainErr)*100)+"%")

trainning Accuracy = 96.41376451077943%


In [219]:
labelsAndPreds_test_bayes = test_data.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds_test_bayes.filter(lambda lp: lp[0] != lp[1]).count() / float(test_data.count())
print("testing Accuracy = " + str((1-testErr)*100)+"%")

testing Accuracy = 96.23756218905473%
