In [1]:
import pandas as pd
import glob
from difflib import SequenceMatcher
import re
import nltk
from nltk.tokenize import word_tokenize
import time

In [2]:
df = pd.read_csv("filtered_tweets.csv", lineterminator='\n')

## Helper functions

In [3]:
def foundPronouns(tweet):
    pronouns = ['i', 'me', 'my', 'we', 'us', 'our']
    regex = re.compile('[^a-zA-Z]')
    
    #remove US and U.S.
    mod_tweet = tweet.replace("US", "")
    mod_tweet = mod_tweet.replace("U.S.", "")
    
    mod_tweet = mod_tweet.lower().replace("'", " ").split()
    for word in mod_tweet:
        re_word = regex.sub('', word)
        if re_word in pronouns:
            return True
    
    return False

def POS(tweets, tags):    
    tweets_tags = []
    
    for i in range(len(tweets)):
        pos = nltk.pos_tag(word_tokenize(tweets[i]))
        first_word_tag = pos[0][1]
        
        if first_word_tag in tags:
            tweets_tags.append(tweets[i])
        
    return tweets_tags

## Find pronouns

In [4]:
tweets = list(df['filtered_tweets'])
tweets_pronouns = []
for tweet in tweets:
    if foundPronouns(tweet):
        tweets_pronouns.append(tweet)

In [5]:
df_pronouns = pd.DataFrame({"tweets_pronouns": tweets_pronouns})
df_pronouns.to_csv("tweets_pronouns.csv")
df_pronouns.sample(100).to_csv("samples/tweets_pronouns.csv")

In [6]:
print(len(tweets_pronouns))

33533


## Tweets begin with a verb

In [7]:
tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
tweets_tags = {}
for tag in tags:
    print("Tag: {}".format(tag))
    tweets_tags["tweets_{}".format(tag)] = POS(tweets, tag)
    print("Length of tweets_{}: {}\n".format(tag, len(tweets_tags["tweets_{}".format(tag)])))

Tag: VB
Length of tweets_VB: 4122

Tag: VBD
Length of tweets_VBD: 4299

Tag: VBG
Length of tweets_VBG: 11415

Tag: VBN
Length of tweets_VBN: 6589

Tag: VBP
Length of tweets_VBP: 4781

Tag: VBZ
Length of tweets_VBZ: 5185



In [8]:
for i in tweets_tags:
    temp = pd.DataFrame({i: tweets_tags[i]})
    temp.to_csv("{}.csv".format(i))
    try:
        temp.sample(100).to_csv("samples/{}.csv".format(i))
    except:
        temp.to_csv("samples/{}.csv".format(i))
    

In [9]:
s = 0
for i in tweets_tags:
    print(i, len(tweets_tags[i]))
    s += len(tweets_tags[i])
s

tweets_VB 4122
tweets_VBD 4299
tweets_VBG 11415
tweets_VBN 6589
tweets_VBP 4781
tweets_VBZ 5185


36391

__NOTES (conclusions)__
1. Removed US and U.S. from all tweets before finding pronouns
2. Replaced apostrophe(') with space so I can detect pronouns in situations like: I'm, we're, etc.
3. Found tweets that begin with a verb. There are 6 tags for verbs in nltk. Some words can have multiple tags.