In [1]:
import csv
import numpy as np
import random
import pandas as pd
from collections import Counter
import json

In [2]:
def init_stop_words():
    """Load the stop-word collection from ``stop_words_english.json``.

    The file is read as UTF-8 from the current working directory and parsed
    as JSON; whatever JSON value it contains is returned unchanged.

    Returns:
        The parsed JSON content of ``stop_words_english.json``.

    Raises:
        FileNotFoundError: if the file is not in the working directory.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # The context manager guarantees the handle is closed even when parsing
    # fails; the previous version never closed the file.
    with open('stop_words_english.json', 'r', encoding='utf-8') as file:
        stop_words = json.load(file)

    return stop_words

In [3]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice holds exactly ``n`` items except possibly the last one,
    which holds whatever remains.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in offsets)

In [4]:
class BaselineModel:
    """Container for the per-party word weights of the baseline classifier.

    Both attributes start out as empty dicts and are expected to be replaced
    with word -> weight mappings once the model has been trained.
    """

    def __init__(self):
        super().__init__()
        # Word weights for Republican-labelled ("R") tweets.
        self.rep_multipler = dict()
        # Word weights for Democrat-labelled ("D") tweets.
        self.demo_multipler = dict()


In [5]:
def train(model, batch_size, num_common_words, max_tweets=30000):
    """Fit the word-frequency baseline on ``train_set.csv``.

    Tweets are lower-cased and split on whitespace. Token counts are kept
    separately for rows whose party is "R" and rows whose party is "D";
    rows with any other party value are ignored. After stop words are
    removed, the ``num_common_words`` most frequent tokens of each party are
    kept and each one is weighted by its share of that party's retained
    token count. The two weight dicts are stored on ``model.rep_multipler``
    and ``model.demo_multipler``.

    Args:
        model: object that receives the ``rep_multipler`` and
            ``demo_multipler`` attributes (token -> weight dicts).
        batch_size: number of tweets per progress-report batch.
        num_common_words: how many of the most frequent tokens to keep for
            each party.
        max_tweets: only the first ``max_tweets`` rows of the file are used.
            Defaults to 30000, the limit that used to be hard-coded;
            pass ``None`` to train on every row.

    Returns:
        None. The result is written onto ``model``.
    """
    header = ['tweet', 'id', 'conversation_id', 'party']
    read_options = dict(names=header, index_col=False)
    try:
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in
        # 2.0; ``on_bad_lines='skip'`` is the replacement with the same
        # effect (malformed rows are dropped).
        train_df = pd.read_csv("train_set.csv", on_bad_lines='skip', **read_options)
    except TypeError:
        # pandas older than 1.3 does not accept ``on_bad_lines``.
        train_df = pd.read_csv("train_set.csv", error_bad_lines=False, **read_options)

    rep_counter = Counter()
    demo_counter = Counter()

    # NOTE(review): ``names=`` is passed without ``header=0``, so if the CSV
    # starts with a header line it is read as an ordinary data row. Nothing
    # here strips it; confirm the file has no header line.
    tweet_list = train_df.values.tolist()

    # Dividing list of tweets into batches (used only for progress output)
    num_batches = 0
    tweet_batch = list(chunks(tweet_list[:max_tweets], batch_size))
    total_batches = len(tweet_batch)

    for batch in tweet_batch:
        for tweet in batch:
            text = tweet[0]
            if not isinstance(text, str):
                # An empty CSV field is parsed as NaN (a float), which has
                # no ``lower()``; such a row has no words to count.
                continue

            # Replacing Typesetter's apostrophe with Typewriter apostrophe
            sentence = text.lower().replace("’", "'").split()

            # ``update`` adds the counts in place. The previous
            # ``counter = counter + Counter(...)`` rebuilt the entire counter
            # for every tweet.
            if tweet[3] == "R":
                rep_counter.update(sentence)
            elif tweet[3] == "D":
                demo_counter.update(sentence)
        num_batches += 1
        print("Training Progression: {:0.2f}%".format((num_batches/total_batches)*100))

    # Exclude stop words from analysis
    for word in init_stop_words():
        rep_counter.pop(word, None)
        demo_counter.pop(word, None)

    # Only focus on the `num_common_words` most common words per party
    rep_common_words = rep_counter.most_common(num_common_words)
    demo_common_words = demo_counter.most_common(num_common_words)

    rep_total_words = sum(count for _, count in rep_common_words)
    demo_total_words = sum(count for _, count in demo_common_words)

    # Assign multipler scores to each word: its share of the retained counts.
    # Every retained count is at least 1, so a non-empty list never divides
    # by zero and an empty list never divides at all.
    model.rep_multipler = {
        word: count / rep_total_words for word, count in rep_common_words
    }
    model.demo_multipler = {
        word: count / demo_total_words for word, count in demo_common_words
    }


In [6]:
# Build an untrained baseline and fit it, reporting progress every 1000
# tweets and keeping the 150 most common words per party.
model = BaselineModel()
train(model, batch_size=1000, num_common_words=150)

  if (await self.run_code(code, result,  async_=asy)):
Training Progression: 3.33%
Training Progression: 6.67%
Training Progression: 10.00%
Training Progression: 13.33%
Training Progression: 16.67%
Training Progression: 20.00%
Training Progression: 23.33%
Training Progression: 26.67%
Training Progression: 30.00%
Training Progression: 33.33%
Training Progression: 36.67%
Training Progression: 40.00%
Training Progression: 43.33%
Training Progression: 46.67%
Training Progression: 50.00%
Training Progression: 53.33%
Training Progression: 56.67%
Training Progression: 60.00%
Training Progression: 63.33%
Training Progression: 66.67%
Training Progression: 70.00%
Training Progression: 73.33%
Training Progression: 76.67%
Training Progression: 80.00%
Training Progression: 83.33%
Training Progression: 86.67%
Training Progression: 90.00%
Training Progression: 93.33%
Training Progression: 96.67%
Training Progression: 100.00%


In [15]:
def test(model, batch_size):
    """Evaluate the baseline model on ``test_set.csv`` and print accuracy.

    Each tweet is lower-cased and split on whitespace. Its Republican and
    Democrat scores are the sums of ``model.rep_multipler`` and
    ``model.demo_multipler`` weights over its tokens. The higher score
    decides the predicted party ("R" or "D"); equal scores leave the tweet
    "unclassified". Two accuracies are printed: one counting unclassified
    tweets as wrong, and one crediting half of them as correct.

    Args:
        model: object exposing ``rep_multipler`` and ``demo_multipler``
            dicts (token -> weight).
        batch_size: number of tweets per progress-report batch.

    Returns:
        None. Results are printed.
    """
    header = ['tweet', 'id', 'conversation_id', 'party']
    read_options = dict(names=header, index_col=False)
    try:
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in
        # 2.0; ``on_bad_lines='skip'`` is the replacement with the same
        # effect (malformed rows are dropped).
        test_df = pd.read_csv("test_set.csv", on_bad_lines='skip', **read_options)
    except TypeError:
        # pandas older than 1.3 does not accept ``on_bad_lines``.
        test_df = pd.read_csv("test_set.csv", error_bad_lines=False, **read_options)

    # NOTE(review): ``names=`` is passed without ``header=0``, so if the CSV
    # starts with a header line it is read, and scored, as an ordinary data
    # row. Nothing here strips it; confirm the file has no header line.
    tweet_list = test_df.values.tolist()

    rep_multipler = model.rep_multipler
    demo_multipler = model.demo_multipler

    # Dividing list of tweets into batches (used only for progress output)
    num_batches = 0
    tweet_batch = list(chunks(tweet_list, batch_size))
    total_batches = len(tweet_batch)

    if total_batches == 0:
        # With no rows there is no accuracy to report; the code below would
        # otherwise fail on ``test_acc[-1]``.
        print("No test tweets found; accuracy is undefined.")
        return

    test_acc = []

    acc = 0
    total = 0
    unclassified_tweets = 0

    for batch in tweet_batch:
        for tweet in batch:
            text = tweet[0]
            if isinstance(text, str):
                # Replacing Typesetter's apostrophe with Typewriter apostrophe
                sentence = text.lower().replace("’", "'").split()
            else:
                # An empty CSV field is parsed as NaN (a float). The row has
                # no words, so it ends up unclassified below.
                sentence = []

            rep_score = 0
            demo_score = 0

            # Accumulate with ``+=`` in token order rather than ``sum()`` so
            # the floating-point totals, and thus ties, stay unchanged.
            for word in sentence:
                rep_score += rep_multipler.get(word, 0)
                demo_score += demo_multipler.get(word, 0)

            if rep_score > demo_score:
                party = "R"
            elif rep_score < demo_score:
                party = "D"
            else:
                party = "unclassified"
                unclassified_tweets += 1

            acc += int(tweet[3] == party)
            total += 1

        num_batches += 1
        print("Testing Progression: {:0.2f}%".format((num_batches/total_batches)*100))
        # Running accuracy over everything seen so far.
        test_acc.append(acc/total)

    print("\n\nTest Accuracy assuming unclassified tweets are all incorrect: {:0.2f}%".format(test_acc[-1]*100))

    # Credit half of the ties as correct, i.e. a coin-flip guess.
    acc += unclassified_tweets/2

    test_acc.append(acc/total)

    print("Final Test Accuracy assuming half of the unclassified tweets are correctly classified: {:0.2f}%".format(test_acc[-1]*100))

In [16]:
# Score the trained model on the test set, reporting progress every 1000 tweets.
test(model, batch_size=1000)

Testing Progression: 0.40%
Testing Progression: 0.80%
Testing Progression: 1.20%
Testing Progression: 1.61%
Testing Progression: 2.01%
Testing Progression: 2.41%
Testing Progression: 2.81%
Testing Progression: 3.21%
Testing Progression: 3.61%
Testing Progression: 4.02%
Testing Progression: 4.42%
Testing Progression: 4.82%
Testing Progression: 5.22%
Testing Progression: 5.62%
Testing Progression: 6.02%
Testing Progression: 6.43%
Testing Progression: 6.83%
Testing Progression: 7.23%
Testing Progression: 7.63%
Testing Progression: 8.03%
Testing Progression: 8.43%
Testing Progression: 8.84%
Testing Progression: 9.24%
Testing Progression: 9.64%
Testing Progression: 10.04%
Testing Progression: 10.44%
Testing Progression: 10.84%
Testing Progression: 11.24%
Testing Progression: 11.65%
Testing Progression: 12.05%
Testing Progression: 12.45%
Testing Progression: 12.85%
Testing Progression: 13.25%
Testing Progression: 13.65%
Testing Progression: 14.06%
Testing Progression: 14.46%
Testing Progress