In [None]:
import pickle
import random
import re
from datetime import datetime,timedelta

import numpy as np
import pandas as pd
import psycopg2
from dateutil import parser
from sqlalchemy import create_engine

In [None]:
import nltk
from nltk.stem import PorterStemmer
#single PorterStemmer instance, reused by stemming() for every token
stemmer = PorterStemmer()

# Pull tweet data from database

In [None]:
#database name and user used for the connections below
#NOTE(review): the '***' values look like redacted placeholders - fill in real values before running
dbname = '***'
username = '***'

In [None]:
#SQLAlchemy 1.4+ only accepts the 'postgresql://' dialect name; the
#legacy 'postgres://' alias raises NoSuchModuleError there
#NOTE(review): engine is not referenced by any later cell shown here
engine = create_engine('postgresql://%s@localhost/%s'%(username,dbname))

In [None]:
#open a psycopg2 connection just long enough to load the raw tweets
con = psycopg2.connect(database = dbname, user = username)

sql_query = """
SELECT * FROM raw_tweet_table;
"""
try:
    df = pd.read_sql_query(sql_query,con)
finally:
    #always release the connection, even if the query fails
    con.close()
#df is a dataframe with columns 'created_at','text' and 'hashtags'

# Keep only tweets with exactly one hashtag

In [None]:
#each row's hashtags arrive as one space-separated string;
#split on whitespace so every row holds a list of hashtags
df['hashtags']=df['hashtags'].str.split()

In [None]:
#keep only tweets whose hashtag list has exactly one entry
#(the mask must be passed directly, not wrapped in a list, to filter rows)
#.str.len() gives NaN for missing hashtags, so those rows are dropped
#instead of raising inside len()
#.copy() makes the result an independent frame so adding columns later
#does not trigger SettingWithCopyWarning
df = df[df['hashtags'].str.len() == 1].copy()

# Processing

In [None]:
#mark words between a negation word and the next punctuation by prefixing neg_
def negation_process(tweet):
    """Prefix ``neg_`` to every word inside a negation scope.

    A scope starts at a negation word (never, no, not, dont, ...; matched
    case-insensitively) and runs through following word/whitespace
    characters up to and including the next punctuation character. The
    negation word itself is left untouched.

    Args:
        tweet: the tweet text as a string.

    Returns:
        The tweet with ``neg_`` prepended to each word in a negation scope.
    """
    negation_scope = r'\b(?:never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint)\b[\w\s]+[^\w\s]'

    def tag_scope(match):
        #only words preceded by whitespace are tagged, which skips the
        #negation word at the very start of the matched scope
        return re.sub(r'(\s+)(\w+)', r'\1neg_\2', match.group(0))

    #a trailing period guarantees a scope is closed even when the tweet
    #has no final punctuation
    terminated = tweet + '.'
    tagged = re.sub(negation_scope, tag_scope, terminated, flags=re.IGNORECASE)
    #strip the period that was added above
    return tagged[:-1]
    

In [None]:
#Porter stemming
def stemming(tweet):
    """Lower-case each whitespace-separated token of `tweet`, stem it with
    the module-level Porter `stemmer`, and re-join the stems with single
    spaces."""
    return ' '.join(stemmer.stem(token.lower()) for token in tweet.split())

In [None]:
#primary text processing
def process_text(tweet_list):
    """Normalize raw tweet strings into stemmed, negation-tagged text.

    For each tweet, in order: strip @mentions, URLs and #hashtags; drop the
    standalone retweet marker ``RT``; remove apostrophes; tag negation
    scopes via negation_process(); replace non-ASCII runs with a space;
    remove ``: , . "`` and turn ``+`` into a space; collapse any run of a
    repeated non-space character to one character; put a space before
    ``!`` and ``?``; strip surrounding whitespace; stem via stemming().

    Args:
        tweet_list: iterable of tweet strings.

    Returns:
        List of processed strings, one per input tweet, in input order.
    """
    processed_tweets = []
    for tweet in tweet_list:
        tweet = re.sub(r"(?:\@|https?\://|#)\S+", "", tweet)
        #drop the retweet marker only when it is a standalone token, and
        #before negation tagging: a plain replace('RT','') also mangles
        #upper-case words such as SMART or PARTY and leaves a stray
        #'neg_' behind when the marker falls inside a negation scope
        tweet = re.sub(r'\bRT\b', '', tweet)
        tweet = tweet.replace('\'','')
        #negate
        tweet = negation_process(tweet)
        #replace non ascii characters
        tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)
        tweet = tweet.replace(':','')
        tweet = tweet.replace('+',' ')
        tweet = tweet.replace(',','')
        tweet = tweet.replace('.','')
        tweet = tweet.replace('\"','')
        #remove duplicate consecutive characters for standardization
        tweet = re.sub(r'(\S)\1+', r'\1', tweet)
        #add spaces before emotive punctuation, useful for bigrams
        tweet = tweet.replace('!',' !')
        tweet = tweet.replace('?',' ?')
        tweet = tweet.strip()
        tweet = stemming(tweet)
        processed_tweets.append(tweet)
    return processed_tweets

In [None]:
#process twitter text
#run every raw tweet through process_text and store the cleaned strings
#in a new 'processed_text' column next to the original text
processed_tweets = process_text(df['text'].tolist())
df['processed_text'] = processed_tweets

In [None]:
#deduplicate on the processed text rather than the raw text:
#processing standardizes tweets, so keep the first row for each
#distinct processed_text value and drop the rest
df = df.drop_duplicates(subset='processed_text')

# Export data

In [None]:
#export dataframe with processed tweet data
#the with-block closes the file even if pickling raises
with open('df_processed_single_hashtag.pickle', 'wb') as f:
    pickle.dump(df[['created_at','processed_text','hashtags']], f)