In [1]:
%matplotlib inline
import re  # helps you filter urls
import string
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
# `import sklearn` alone does not guarantee that its submodules are loaded on
# every scikit-learn version, so import the ones this notebook accesses as
# attributes (sklearn.svm.SVC, sklearn.model_selection.KFold, ...) explicitly.
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.svm
from IPython.display import display, Latex, Markdown
from scipy import sparse

warnings.filterwarnings('ignore')

In [None]:
# Download the NLTK resources requested by this notebook: the stop-word
# corpus, WordNet, the averaged-perceptron POS tagger and omw-1.4.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
# Verify that the following commands work for you, before moving on.

# Module-level lemmatizer and English stop-word list.
# NOTE(review): the calls to process()/process_all() visible in this notebook
# do not pass this `lemmatizer`; they fall back to their own default instance.
lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()
stopwords=nltk.corpus.stopwords.words('english')

In [None]:
# Download the 'punkt' resource.
# Presumably required by nltk.word_tokenize used in process() — confirm.
nltk.download('punkt')

In [4]:
# Lookup table: first letter of the tag produced by nltk.pos_tag -> POS code
# handed to the lemmatizer (noun, verb, adjective, adverb).
posMapping = dict(N='n', V='v', J='a', R='r')
def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """Normalize a text string into a list of lemmatized tokens.

    Steps, in order:
      1. lower-case the text;
      2. drop every "'s", then every remaining "'";
      3. remove http:// and https:// URLs;
      4. replace each character of ``string.punctuation`` with a space;
      5. tokenize with ``nltk.word_tokenize`` and tag with ``nltk.pos_tag``;
      6. lemmatize each token, choosing the POS from ``posMapping`` by the
         first letter of its tag (falling back to noun, ``'n'``).

    Parameters
    ----------
    text : str
        Raw input text.
    lemmatizer : object with a ``lemmatize(word, pos=...)`` method
        Defaults to a WordNetLemmatizer created once, when the function is
        defined.

    Returns
    -------
    list of str
        The lemmatized tokens, in order of appearance. A token whose
        lemmatization raises is skipped (best effort).
    """
    cleaned = text.lower()
    cleaned = cleaned.replace("'s", "")
    cleaned = cleaned.replace("'", "")
    cleaned = re.sub(r'https?://\S+', '', cleaned)
    # Punctuation becomes whitespace so that e.g. "a/b" splits into two tokens.
    cleaned = cleaned.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    )

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(cleaned))

    token_list = []
    for tagged in tagged_tokens:
        try:
            lemma = lemmatizer.lemmatize(
                tagged[0], pos=posMapping.get(tagged[1][0:1], 'n')
            )
        except Exception:
            # Best effort: drop tokens that cannot be lemmatized. Catching
            # Exception (not a bare except) keeps KeyboardInterrupt and
            # SystemExit working while a large corpus is being processed.
            continue
        token_list.append(lemma)
    return token_list

In [6]:
# Load the training tweets. na_filter=False turns off pandas' missing-value
# detection when reading the file.
tweets = pd.read_csv("tweets_train.csv", na_filter=False)
# Show the first rows as a quick sanity check.
display(tweets.head())

Unnamed: 0,screen_name,text
0,GOP,RT @GOPconvention: #Oregon votes today. That m...
1,TheDemocrats,RT @DWStweets: The choice for 2016 is clear: W...
2,HillaryClinton,Trump's calling for trillion dollar tax cuts f...
3,HillaryClinton,.@TimKaine's guiding principle: the belief tha...
4,timkaine,Glad the Senate could pass a #THUD / MilCon / ...


In [7]:
def process_all(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """Return a deep copy of ``df`` whose 'text' column holds token lists.

    Every entry of the 'text' column is passed through ``process`` together
    with ``lemmatizer``; the input frame itself is left untouched.
    """
    def _tokenize(raw_text):
        return process(raw_text, lemmatizer)

    result = df.copy(deep=True)
    result['text'] = result['text'].apply(_tokenize)
    return result

In [8]:
# Run process() over every tweet's text using process_all's default lemmatizer.
processed_tweets = process_all(tweets)
# Plain-text preview of the first processed rows.
print(processed_tweets.head())

      screen_name                                               text
0             GOP  [rt, gopconvention, oregon, vote, today, that,...
1    TheDemocrats  [rt, dwstweets, the, choice, for, 2016, be, cl...
2  HillaryClinton  [trump, call, for, trillion, dollar, tax, cut,...
3  HillaryClinton  [timkaine, guide, principle, the, belief, that...
4        timkaine  [glad, the, senate, could, pass, a, thud, milc...


In [9]:
# 11% credits
def create_features(processed_tweets, stop_words):
    """Fit a TF-IDF vectorizer on pre-tokenized tweets and vectorize them.

    ``processed_tweets['text']`` is expected to already contain token lists,
    so the vectorizer's tokenizer simply returns its input and lower-casing
    is switched off. Terms appearing in fewer than two documents are dropped
    (``min_df=2``) and ``stop_words`` are excluded.

    Returns the fitted vectorizer and the TF-IDF matrix of the same tweets.
    """
    token_lists = processed_tweets['text']
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
        analyzer='word',
        tokenizer=lambda text: text,
        stop_words=stop_words,
        lowercase=False,
        min_df=2,
    )
    vectorizer = vectorizer.fit(token_lists)
    features = vectorizer.transform(token_lists)
    return vectorizer, features
    

In [10]:
# Run every stop word through process() so the stop-word list is normalized
# the same way as the tweet tokens, then flatten the per-word token lists.
processed_stopwords = list(np.concatenate([process(word) for word in stopwords]))
# Fit the TF-IDF vectorizer on the processed tweets and build the feature matrix.
(tfidf, X) = create_features(processed_tweets, processed_stopwords)
tfidf, X

(TfidfVectorizer(lowercase=False, min_df=2,
                 stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                             'ourselves', 'you', 'youre', 'youve', 'youll',
                             'youd', 'your', 'yours', 'yourself', 'yourselves',
                             'he', 'him', 'his', 'himself', 'she', 'she', 'her',
                             'hers', 'herself', 'it', 'it', 'it', 'itself', ...],
                 tokenizer=<function create_features.<locals>.<lambda> at 0x1285d9440>),
 <17298x8116 sparse matrix of type '<class 'numpy.float64'>'
 	with 169166 stored elements in Compressed Sparse Row format>)

In [11]:
def create_labels(processed_tweets):
    """Build the label array from the 'screen_name' column.

    Accounts listed in ``republican_accounts`` get label 0; every other
    screen name gets label 1. Returns a NumPy array with one entry per row.
    """
    republican_accounts = ("realDonaldTrump", "mike_pence", "GOP")
    return np.array([
        0 if name in republican_accounts else 1
        for name in processed_tweets['screen_name']
    ])

In [None]:
# Label vector for the training tweets (0/1 as assigned by create_labels).
y = create_labels(processed_tweets)
y

In [13]:
from sklearn.metrics import accuracy_score

In [15]:
class MajorityLabelClassifier():
    """Baseline classifier that always predicts the most frequent training label.

    Works with any label values ``np.unique`` can sort (non-negative or
    negative integers, strings, ...). When several labels are tied for most
    frequent, the smallest one is chosen.
    """

    def __init__(self):
        # Most frequent label seen by fit(); None until fit() has been called.
        self.mode = None

    def fit(self, X, y):
        """Remember the most frequent label in ``y``.

        ``X`` is ignored. Raises ValueError if ``y`` is empty. Returns
        ``self`` so calls can be chained.
        """
        labels, counts = np.unique(np.asarray(y), return_counts=True)
        # np.unique returns sorted labels and argmax picks the first maximum,
        # so ties resolve to the smallest label.
        self.mode = labels[np.argmax(counts)]
        return self

    def predict(self, X):
        """Return an array of length ``X.shape[0]`` filled with the majority label.

        Raises ValueError when called before fit().
        """
        if self.mode is None:
            raise ValueError(
                "MajorityLabelClassifier must be fitted before calling predict()."
            )
        return np.full(X.shape[0], self.mode)


# Majority-label baseline: fitted and scored on the same data, so the number
# printed below is a training accuracy.
baselineClf = MajorityLabelClassifier()
baselineClf.fit(X,y)
predicted_labels = baselineClf.predict(X)
training_accuracy = accuracy_score(y, predicted_labels)
print("Accuracy:", training_accuracy)

Accuracy: 0.5001734304543878


In [16]:
def learn_classifier(X_train, y_train, kernel):
    """Train and return an ``sklearn.svm.SVC`` with the given kernel.

    All other SVC hyper-parameters are left at their defaults.
    """
    svm_model = sklearn.svm.SVC(kernel=kernel)
    svm_model.fit(X_train, y_train)
    return svm_model

In [17]:
# Train an SVM with a linear kernel on the full training feature matrix.
classifier = learn_classifier(X, y, 'linear')

In [18]:
def evaluate_classifier(classifier, X_validation, y_validation):
    """Return the accuracy of ``classifier`` on the given features and labels.

    The classifier's predictions for ``X_validation`` are compared against
    ``y_validation`` with ``accuracy_score``.
    """
    return accuracy_score(y_validation, classifier.predict(X_validation))

In [19]:
# Accuracy of the linear SVM on the same data it was trained on (X, y).
accuracy = evaluate_classifier(classifier, X, y)
print(accuracy)

0.9544456006474737


In [20]:
# 4-fold cross-validation splitter; shuffling with a fixed random_state so the
# folds are reproducible.
kf = sklearn.model_selection.KFold(n_splits=4, random_state=1, shuffle=True)
kf

KFold(n_splits=4, random_state=1, shuffle=True)

In [33]:
def best_model_selection(kf, X, y, kernels=('linear', 'rbf', 'poly', 'sigmoid')):
    """Pick the SVM kernel with the highest mean cross-validation accuracy.

    Parameters
    ----------
    kf : splitter with a ``split(X)`` method yielding (train_index, val_index)
    X : feature matrix, indexable by the index arrays produced by ``kf``
    y : label array, indexable the same way
    kernels : iterable of str, optional
        Candidate kernels, tried in order. Defaults to the four kernels the
        notebook originally hard-coded.

    Returns
    -------
    str
        The kernel with the best mean accuracy; on ties the earlier candidate
        wins. An empty string is returned only when ``kernels`` is empty.

    Side effect: prints the per-fold accuracies of each kernel.
    """
    best_kernel = ""
    # Start below any attainable accuracy so that a candidate is always
    # selected, even if every kernel scores exactly 0.
    best_accuracy = -np.inf
    for kernel in kernels:
        kernel_accuracies = []
        for train_index, val_index in kf.split(X):
            fold_classifier = learn_classifier(X[train_index], y[train_index], kernel)
            kernel_accuracies.append(
                evaluate_classifier(fold_classifier, X[val_index], y[val_index])
            )
        print("{} : {}".format(kernel, kernel_accuracies))
        avg_accuracy = np.mean(kernel_accuracies)
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_kernel = kernel

    return best_kernel
# Cross-validate every candidate kernel and keep the best-scoring one.
best_kernel = best_model_selection(kf, X, y)
best_kernel

linear : [0.9024277456647399, 0.9079768786127168, 0.9056429232192414, 0.90263644773358]
rbf : [0.9102890173410405, 0.9116763005780347, 0.9100370027752082, 0.9144310823311749]
poly : [0.909364161849711, 0.9172254335260116, 0.919981498612396, 0.9216003700277521]
sigmoid : [0.8931791907514451, 0.9021965317919075, 0.8975485661424607, 0.8970860314523589]


'poly'

In [31]:
def classify_tweets(tfidf, classifier, unlabeled_tweets):
    """Predict labels for a frame of raw, unlabeled tweets.

    The tweets are tokenized with ``process_all``, vectorized with the
    already-fitted ``tfidf`` vectorizer, and passed to ``classifier.predict``.
    """
    token_lists = process_all(unlabeled_tweets)['text']
    return classifier.predict(tfidf.transform(token_lists))

In [32]:
# Retrain on the full training set with the selected kernel, then label the
# tweets in tweets_test.csv.
classifier = learn_classifier(X, y, best_kernel)
unlabeled_tweets = pd.read_csv("tweets_test.csv", na_filter=False)
y_pred = classify_tweets(tfidf, classifier, unlabeled_tweets)
print(y_pred)

[1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 1 1 1 0
 0 1 1 0 0 0 1 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 1 1 0 0 1 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 0 1 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 1 0 1 0 0 0 1 1 0 0 1
 0 1 0 1 0 0 1 1 0 1 0 1 1 0 0 0 1 1 1 1 0 1 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0
 0 1 0 1 1 1 1 0 1 1 1 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 1 1 1 0 0 0 0 1 1 0
 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 1 1 1 0 1
 0 0 0 0 1 1 1 1 1 1 0 0 1 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 1 1
 1 0 1 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 0 0 1 0 1 1 0 0 0 1 1 1 0 0 1 0 1 0
 1 0 1 1 1 1 1 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 1 0 1
 0 0 1 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 0 1
 1 0 1 1 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 1 1 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 1 1
 0 0 1 0 0 1 0 0 0 1 0 1 

In [35]:
import openai
import time
from openai import OpenAI
# Seed NumPy's global random generator so NumPy-based randomness below is repeatable.
np.random.seed(4200)

In [36]:
# Read the OpenAI API key from a local text file, stripping trailing whitespace.
# NOTE(review): make sure openai-key.txt is never committed or shared.
with open('openai-key.txt', 'r') as file:
    openai_key = file.read().rstrip()

In [37]:
# Module-level OpenAI client, reused by the API calls below.
client = OpenAI(api_key=openai_key)

In [38]:
def chatGPT(client, input_string, prompt="You are a helpful assistant.", model="gpt-3.5-turbo-0125"):
    """Send one system + user message pair to the chat completions endpoint.

    Parameters
    ----------
    client : object exposing ``chat.completions.create(...)``
    input_string : str
        Content of the user message.
    prompt : str
        Content of the system message.
    model : str
        Model name passed to the API.

    Returns the ``message.content`` of the first choice in the response.
    Sampling settings are fixed: temperature 0.7, max_tokens 2048, top_p 1.
    """
    system_message = {"role": "system", "content": prompt}
    user_message = {"role": "user", "content": input_string}
    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=0.7,
        max_tokens=2048,
        top_p=1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [39]:
def generate_prompt(initial_prompt, tweets, length_limit):
    """Append a numbered prefix of ``tweets`` to ``initial_prompt``.

    Tweets are added in order as `` \\n<i>. <tweet>`` entries for as long as
    the resulting prompt stays shorter than ``length_limit`` characters. The
    loop stops at the first tweet that does not fit, so the tweets in the
    prompt are always exactly ``tweets[:count]``. Callers rely on that when
    they continue with ``tweets[count:]``.

    Parameters
    ----------
    initial_prompt : str
        Text placed at the start of the prompt.
    tweets : iterable of str
        Candidate tweets, in order.
    length_limit : int
        Exclusive upper bound on the prompt length, in characters.

    Returns
    -------
    (str, int)
        The assembled prompt and the number of tweets it contains. The count
        is 0 when even the first tweet does not fit.
    """
    prompt = initial_prompt
    count = 0
    for tweet in tweets:
        # Measure the real entry, including a multi-digit index.
        entry = " \n{}. ".format(count + 1) + tweet
        if len(prompt) + len(entry) >= length_limit:
            # Stop here instead of skipping ahead: including a later, shorter
            # tweet would break the "first `count` tweets" contract.
            break
        prompt += entry
        count += 1
    return prompt, count

In [40]:
# Instruction text asking the model for a single 'D' or 'R' label per tweet.
predict_initial_prompt = ("For each of the following tweets, predict the political inclination of the tweet.Label a single character 'D' if the tweet is democrat or 'R' if it's a republican. The label should be either 'D' or 'R' only")

# Five example tweets used to try the prompt out.
some_tweets = ("Trump's calling for trillion dollar tax cuts for Wall Street. It's time for them to pay their fair share.",
          "Obama is losing credibility with Syrian opposition leaders https://t.co/bANRRu2ktN",
          "Positive relationships between faith groups &amp; law enforcement build more resilient communities https://t.co/6pfrSKVE72",
          "Tune in now to watch @JoeBiden hit the trail for Hillary in Ohio: https://t.co/FjCws9BTYy",
          "Happy Birthday to a great President of the United States, George H. W. Bush! https://t.co/YOIB2eFfHG")

# Build one numbered prompt (length limit 2048), show it, and send it as the
# user message; the system message is chatGPT's default.
prompt_predict, count = generate_prompt(predict_initial_prompt, some_tweets, 2048)
display(prompt_predict)
chatGPT(client, prompt_predict,model = "gpt-3.5-turbo")

"For each of the following tweets, predict the political inclination of the tweet.Label a single character 'D' if the tweet is democrat or 'R' if it's a republican. The label should be either 'D' or 'R' only \n1. Trump's calling for trillion dollar tax cuts for Wall Street. It's time for them to pay their fair share. \n2. Obama is losing credibility with Syrian opposition leaders https://t.co/bANRRu2ktN \n3. Positive relationships between faith groups &amp; law enforcement build more resilient communities https://t.co/6pfrSKVE72 \n4. Tune in now to watch @JoeBiden hit the trail for Hillary in Ohio: https://t.co/FjCws9BTYy \n5. Happy Birthday to a great President of the United States, George H. W. Bush! https://t.co/YOIB2eFfHG"

'1. D\n2. R\n3. D\n4. D\n5. R'

In [41]:
# Model used for the API calls below; replace it with the model name of your choice.
model_name = "gpt-3.5-turbo"

In [42]:
def generate_prompts(initial_prompt, tweets, length_limit):
    """Split ``tweets`` into as many prompts as needed to cover all of them.

    Each prompt is built by ``generate_prompt`` from the tweets that are still
    unassigned; the number of tweets it reports as consumed is then removed
    from the front of the remaining list.

    Parameters
    ----------
    initial_prompt : str
        Text placed at the start of every prompt.
    tweets : sliceable sequence of str
    length_limit : int
        Length limit forwarded to ``generate_prompt``.

    Returns
    -------
    list of str
        The prompts, in order.

    Raises
    ------
    ValueError
        If no tweet can be placed into a prompt (for example a tweet that is
        too long for ``length_limit``). Without this check the loop would
        never make progress.
    """
    prompts = []
    remaining = tweets
    while len(remaining) > 0:
        prompt, count = generate_prompt(initial_prompt, remaining, length_limit)
        if count == 0:
            raise ValueError(
                "No tweet fits into a prompt of length_limit={}; "
                "increase the limit or shorten the tweets.".format(length_limit)
            )
        prompts.append(prompt)
        remaining = remaining[count:]

    return prompts
    
# Draw 50 random training tweets and pack their texts into prompts of at most
# 2048 characters.
# NOTE(review): sample() is called without random_state; reproducibility
# presumably relies on the global NumPy seed set earlier — confirm.
sample_tweets = tweets.sample(50)
prompts = generate_prompts(predict_initial_prompt, sample_tweets["text"].to_list(), 2048)
prompts

["For each of the following tweets, predict the political inclination of the tweet.Label a single character 'D' if the tweet is democrat or 'R' if it's a republican. The label should be either 'D' or 'R' only \n1. RT @GovPenceIN: .@INDOT has been hard at work ensuring the Crossroads of America has the infrastructure to back that moniker up http://t.co‚Ä¶ \n2. Today at 4:15 PM ET: @JoeKennedy &amp; @PrattWiley answer your #VoterRegistrationDay questions. Share your question her‚Ä¶ https://t.co/EbTaRpkXRK \n3. RT @MSNBC: The First in the South Democratic Candidates Forum is live now! Stream it live at https://t.co/qBv7KnZBk0 #MSNBC2016 https://t.c‚Ä¶ \n4. NEBRASKA #VoteTrump TODAY!\n#MakeAmericaGreatAgain #Trump2016\nhttps://t.co/hGbesTbQci \n5. Bush administration foreign policy positions =\nForeign policy positions at #GOPDebate\n#TransformationTuesday? https://t.co/KOiigTggcf \n6. Democrats will continue to stand with women and fight for their right to a safe and legal abortion. https

In [None]:
def response_to_predictions(response_string):
    """Extract one label per non-blank line of a model response.

    A response is expected to look like ``"1. D\\n2. R"``. For every non-blank
    line the text after the last '.' is taken as the label. A trailing period
    (``"1. D."``) and surrounding quotes (``"1. 'D'"``) are ignored.

    Blank lines are skipped so they do not produce empty predictions that
    would shift every later label. An empty or ``None`` response yields an
    empty list.

    Returns
    -------
    list of str
        The extracted labels, in order. The labels are not validated here.
    """
    predictions = []
    if not response_string:
        return predictions

    for line in response_string.split('\n'):
        # Remove surrounding whitespace and a trailing period, which would
        # otherwise make the text after the last '.' empty.
        cleaned = line.strip().rstrip('.').strip()
        if not cleaned:
            continue
        label = cleaned.split('.')[-1].strip().strip("'\"")
        predictions.append(label)

    return predictions

In [47]:
def tweets_to_predictions(predict_initial_prompt, tweet_list, model_name='gpt-3.5-turbo-0125', verbose=False,
                          length_limit=2048, delay_seconds=10):
    """Label every tweet in ``tweet_list`` with the chat model, batch by batch.

    The tweets are packed into prompts with ``generate_prompt``. Each prompt
    is sent through ``chatGPT`` using the module-level ``client``, with
    ``predict_initial_prompt`` as the system message. The response is parsed
    with ``response_to_predictions``.

    The result always holds exactly one entry per input tweet. When a response
    has fewer labels than the batch has tweets, the batch is padded with ``''``.
    When it has more, the surplus is dropped. Previously such a batch shifted
    every later prediction against its tweet.

    Parameters
    ----------
    predict_initial_prompt : str
        Instruction text; used as prompt prefix and as system message.
    tweet_list : sequence of str
    model_name : str
    verbose : bool
        Print each prompt before it is sent.
    length_limit : int, optional
        Maximum prompt length passed to ``generate_prompt`` (default 2048).
    delay_seconds : float, optional
        Pause between consecutive requests (default 10).

    Returns
    -------
    list of str
        One raw label per tweet, in input order. Values are not guaranteed to
        be 'D' or 'R'.

    Raises
    ------
    ValueError
        If a tweet cannot be placed into any prompt of ``length_limit``.
    """
    all_predictions = []
    remaining = list(tweet_list)
    while remaining:
        prompt, count = generate_prompt(predict_initial_prompt, remaining, length_limit)
        if count == 0:
            raise ValueError(
                "No tweet fits into a prompt of length_limit={}.".format(length_limit)
            )
        if verbose:
            print("Processing prompt: {}".format(prompt))
        response = chatGPT(client, prompt, prompt=predict_initial_prompt, model=model_name)
        predictions = response_to_predictions(response)
        if len(predictions) != count:
            # Keep the output aligned with the input: pad missing labels with
            # '' and drop any surplus lines.
            predictions = (predictions + [''] * count)[:count]
        all_predictions.extend(predictions)
        remaining = remaining[count:]
        if remaining:
            # Pause only between requests, not after the last one.
            time.sleep(delay_seconds)
    return all_predictions


# Label the 50 sampled tweets with the chat model, printing each prompt sent.
all_predictions = tweets_to_predictions(predict_initial_prompt, sample_tweets["text"].to_list(), model_name=model_name, verbose=True)

all_predictions

Processing prompt: For each of the following tweets, predict the political inclination of the tweet.Label a single character 'D' if the tweet is democrat or 'R' if it's a republican. The label should be either 'D' or 'R' only 
1. RT @GovPenceIN: .@INDOT has been hard at work ensuring the Crossroads of America has the infrastructure to back that moniker up http://t.co‚Ä¶ 
2. Today at 4:15 PM ET: @JoeKennedy &amp; @PrattWiley answer your #VoterRegistrationDay questions. Share your question her‚Ä¶ https://t.co/EbTaRpkXRK 
3. RT @MSNBC: The First in the South Democratic Candidates Forum is live now! Stream it live at https://t.co/qBv7KnZBk0 #MSNBC2016 https://t.c‚Ä¶ 
4. NEBRASKA #VoteTrump TODAY!
#MakeAmericaGreatAgain #Trump2016
https://t.co/hGbesTbQci 
5. Bush administration foreign policy positions =
Foreign policy positions at #GOPDebate
#TransformationTuesday? https://t.co/KOiigTggcf 
6. Democrats will continue to stand with women and fight for their right to a safe and legal abortion

['R',
 'D',
 'D',
 'R',
 'R',
 'D',
 'R',
 'R',
 'D',
 'D',
 'D',
 'D',
 'R',
 'D',
 'R',
 'R',
 'R',
 'D',
 'D',
 'D',
 'R',
 'R',
 'R',
 'D',
 'R',
 'R',
 'D',
 'D',
 'R',
 'D',
 'D',
 'D',
 'R',
 'R',
 'R',
 'D',
 'R',
 'D',
 'R',
 'R',
 'D',
 'D',
 'R',
 'R',
 'D',
 'R',
 'D',
 'R',
 'D',
 'R']

In [45]:
# Hold out 5% of the labelled tweets as a test set; the fixed random_state
# makes the split reproducible.
train_tweets_sampled, test_tweets_sampled = sklearn.model_selection.train_test_split(tweets, test_size=0.05, random_state=4200)

In [46]:
# --- SVM evaluated on the held-out split ---
# processing train data
train_tweets_sampled_processed = process_all(train_tweets_sampled)
# creating features from train data (a fresh vectorizer fitted on the train split only)
(tfidf_1, X_train_sampled) = create_features(train_tweets_sampled_processed, processed_stopwords)
# creating output labels for train data
y_train_sampled = create_labels(train_tweets_sampled)
# creating classifier using the best kernel
classifier_1 = learn_classifier(X_train_sampled, y_train_sampled, best_kernel)

# getting predictions from SVM classifier (only the 'text' column is passed in)
y_pred_sampled = classify_tweets(tfidf_1, classifier_1, test_tweets_sampled[['text']])
# getting labels for test data
y_test_sampled = create_labels(test_tweets_sampled)

# calculating accuracy for the SVM classifier: fraction of matching labels
correct = 0
for label, response in zip(y_test_sampled, y_pred_sampled):
    if label == response:
        correct += 1
accuracy_svm = correct / len(y_pred_sampled)
accuracy_svm

0.9260115606936417

In [48]:
# Label the same held-out tweets with the chat model (overwrites the earlier
# `all_predictions` from the 50-tweet sample).
all_predictions = tweets_to_predictions(predict_initial_prompt, test_tweets_sampled["text"].to_list(), model_name=model_name)

In [55]:
# Replace every prediction that is neither 'R' nor 'D': while fewer than half
# of the anomalies have been replaced they become 'R', the rest become 'D'.
# NOTE(review): this assignment is arbitrary (it depends only on position),
# so it influences the accuracy reported below — confirm this is intended.
modified_predictions = []
changed = 0  # number of anomalous labels replaced so far
half_anomalies_count = sum(1 for value in all_predictions if value not in ['R','D']) / 2
for label in all_predictions:
    if label not in ['R','D'] :
        if changed < half_anomalies_count:
            modified_predictions.append('R')
        else:
            modified_predictions.append('D')
        changed += 1
    else:
        modified_predictions.append(label)

In [56]:
# Accuracy of the chat-model predictions on the held-out tweets.
test_tweets_labels = create_labels(test_tweets_sampled)
# Same encoding as create_labels: republican -> 0, democrat -> 1.
predictions_map = {'R' : 0, 'D' : 1}
new_predictions = [predictions_map[value] for value in modified_predictions]
# Note: this rebinds `accuracy`, which earlier held the linear SVM's training accuracy.
accuracy = accuracy_score(test_tweets_labels, new_predictions)
accuracy

0.8601156069364162