In [62]:
import csv
import nltk
import copy
import time
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

## Set Up
# I/O files
# Input: tab-delimited submissions file, read row by row by the
# feature-extraction loop (first row is skipped as a header).
reddit_submissions_file = 'data/reddit_submissions_jan2012.txt'
# Output: one feature row per submission, written with csv.writer.
output_file = 'output/text_features.tsv'

# NLP init
# Shared, module-level NLP objects; built once because the helpers below
# are called for every title.
# English stop words, used by cleanAndStemTitle to drop tokens.
sw = set(stopwords.words('english'))
# English Snowball stemmer, used by cleanAndStemTitle.
stemmer = SnowballStemmer('english')
# VADER sentiment analyzer, used by getSentimentFeature.
sid = SentimentIntensityAnalyzer()

# Basic Linguistic Features

We can gather some simple features for our model in a single pass over each title, including:
- proportion of "good" words
- proportion of "bad" words
- proportion of "subreddit-specific" words
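- presence of each part of speech (determiner, pronoun, noun, adjective, adverb, interjection, preposition)
- sentiment of the title (VADER compound score bucketed into -1, 0, or 1 at ±0.5)
- is the title longer than 16 tokens?
- is the title shorter than 4 tokens?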

In [63]:
## Helper Functions

def cleanAndStemTitle(title):
    """Tokenize a title, drop noise tokens, and stem what remains.

    The title is split with nltk.wordpunct_tokenize. A token is kept only
    if it is purely alphanumeric and is not an English stop word; each kept
    token is then stemmed with the module-level Snowball stemmer.

    Args:
        title: raw title text.

    Returns:
        list of str: stemmed tokens, in their original order.
    """
    # The NLTK stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalized stop words ("The", "And", ...) are not filtered.
    return [str(stemmer.stem(t)) for t in nltk.wordpunct_tokenize(title)
            if t.isalnum() and t.lower() not in sw]

def cleanTitle(title):
    """Split a raw title into word and punctuation tokens.

    No filtering or stemming is applied; the result is the direct output
    of nltk.wordpunct_tokenize.

    Args:
        title: raw title text.

    Returns:
        list: the tokens of the title, in order.
    """
    tokens = nltk.wordpunct_tokenize(title)
    return tokens

def getNumTokens(title):
    """Count the space-separated tokens in a title.

    Uses the same tokenization as getLengthFeatures (a split on single
    spaces), so an empty title counts as one token and consecutive spaces
    produce empty tokens that are counted too.

    Args:
        title: raw title text.

    Returns:
        int: number of pieces obtained by splitting the title on ' '.
    """
    # Split the title on spaces. The previous `' '.split(title)` had the
    # receiver and argument swapped: it split the one-character string ' '
    # using the title as the separator.
    return len(title.split(' '))

# Coarse part-of-speech categories tracked as features.
posTypes = [
    'determiner', 'pronoun', 'noun', 'adjective',
    'adverb', 'interjection', 'preposition',
]

# Template feature dict: one 'has_<type>' flag per category, all unset (0).
posPresencesTemplate = dict(('has_' + t, 0) for t in posTypes)

# Maps a POS tag returned by nltk.pos_tag to one of the coarse categories
# above. Tags that do not appear here are not tracked.
# See https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk
posTagToTypeMap = {
    tag: pos_type
    for pos_type, tags in (
        ('determiner', ('DT', 'PDT', 'WDT')),
        ('pronoun', ('PRP', 'PRP$', 'WP', 'WP$')),
        ('noun', ('NN', 'NNS', 'NNP', 'NNPS')),
        ('adjective', ('JJ', 'JJR', 'JJS')),
        ('adverb', ('RB', 'RBR', 'RBS', 'WRB')),
        ('interjection', ('UH',)),
        ('preposition', ('IN',)),
    )
    for tag in tags
}

def getPosPresenceFeatures(clean_title_tokens):
    """Flag which coarse part-of-speech categories occur in a title.

    The tokens are tagged with nltk's pos_tag; each tag is looked up in
    posTagToTypeMap and, when it maps to a category, the corresponding
    'has_<category>' flag is set to 1. Unmapped tags are ignored.

    Args:
        clean_title_tokens: list of title tokens (as produced by cleanTitle).

    Returns:
        dict: a fresh copy of posPresencesTemplate with 0/1 flags.
    """
    presences = copy.deepcopy(posPresencesTemplate)
    for _, tag in pos_tag(clean_title_tokens):
        pos_type = posTagToTypeMap.get(tag)
        if pos_type is not None:
            presences['has_' + pos_type] = 1
    return presences

def getLengthFeatures(raw_title):
    """Build the title-length indicator features.

    Tokens are obtained by splitting the raw title on single spaces.

    Args:
        raw_title: raw title text.

    Returns:
        dict with two 0/1 entries:
            'is_long_title'  -- 1 when the title has more than 16 tokens
            'is_short_title' -- 1 when the title has fewer than 4 tokens
    """
    token_count = len(raw_title.split(' '))
    return {
        'is_long_title': 1 if token_count > 16 else 0,
        'is_short_title': 1 if token_count < 4 else 0,
    }

def getSentimentFeature(raw_title):
    """Bucket the title's sentiment into -1, 0, or 1.

    The compound polarity score from the module-level sentiment analyzer
    is mapped to 1 when it is above 0.5, to -1 when it is below -0.5, and
    to 0 otherwise.

    Args:
        raw_title: raw title text.

    Returns:
        dict: {'sentiment': -1 | 0 | 1}
    """
    compound = sid.polarity_scores(raw_title)['compound']
    if compound > 0.5:
        label = 1
    elif compound < -0.5:
        label = -1
    else:
        label = 0
    return {'sentiment': label}

# Header for CSV file: column names, in the order values appear in each row
header = [
    'post_id',
    'sentiment',
    'is_long_title', 'is_short_title',
    'has_determiner', 'has_pronoun', 'has_noun', 'has_adjective',
    'has_adverb', 'has_interjection', 'has_preposition',
]


# Object map to feature vector
def getFeatureVectorFromFeatures(post_id, features):
    """Flatten a feature dict into a row ordered like `header`.

    Args:
        post_id: identifier placed in the first column.
        features: dict containing every feature column named in `header`
            (all entries after 'post_id').

    Returns:
        list: [post_id, <feature values in header order>].

    Raises:
        KeyError: if a feature column is missing from `features`.
    """
    # header[0] is 'post_id', which is passed separately rather than
    # stored in the feature dict.
    return [post_id] + [features[column] for column in header[1:]]
    

In [65]:
# Iterate over submissions line by line, writing one feature row to
# output_file for each submission title.
#
# NOTE: this cell targets Python 2, where the csv module expects files
# opened in binary mode ('rb' / 'wb').
# NOTE: rows are written with csv.writer's default comma delimiter (despite
# the .tsv extension of output_file) and no header row is emitted; the
# column order is the one listed in `header`.
with open(reddit_submissions_file, 'rb') as tsv_file, open(output_file, 'wb') as csv_file:
    tsvin = csv.reader(tsv_file, delimiter='\t')
    csvout = csv.writer(csv_file)

    # Skip the header row once, instead of testing for it on every iteration.
    next(tsvin, None)

    it = 0  # number of submissions written so far
    start = time.time()
    for row in tsvin:
        # Blank or truncated lines have no title column (row[-2]); skip them
        # rather than aborting a long run with an IndexError.
        if len(row) < 2:
            continue

        # Where we'll be storing all the features for the title
        features = {}

        # Pull out the raw title text (second-to-last column of the row)
        raw_title = row[-2]
        # Tokenize for POS tagging. cleanAndStemTitle is deliberately not
        # called here: none of the features below use stemmed tokens, and
        # stemming every title was pure overhead.
        clean_title_tokens = cleanTitle(raw_title)

        # Feature 1: Pos presence features
        features.update(getPosPresenceFeatures(clean_title_tokens))
        # Feature 2: Length features
        features.update(getLengthFeatures(raw_title))
        # Feature 3: Sentiment analysis
        features.update(getSentimentFeature(raw_title))

        # Convert features to feature vector; row[0] is the post id
        csvout.writerow(getFeatureVectorFromFeatures(row[0], features))

        # Report progress after the row is written, so "N done" is accurate.
        it += 1
        if it % 50000 == 0:
            print("%d done, (%f elapsed)" % (it, time.time() - start))

        

50000 done, (82.243857 elapsed)
100000 done, (166.108537 elapsed)
150000 done, (251.480932 elapsed)
200000 done, (339.156273 elapsed)
250000 done, (419.584445 elapsed)
300000 done, (502.564954 elapsed)
350000 done, (584.122533 elapsed)
400000 done, (664.496846 elapsed)
450000 done, (745.924309 elapsed)
500000 done, (827.004177 elapsed)
550000 done, (908.188698 elapsed)
600000 done, (988.220124 elapsed)
650000 done, (1069.092932 elapsed)
700000 done, (1151.403407 elapsed)
750000 done, (1234.733307 elapsed)
800000 done, (1317.488206 elapsed)
850000 done, (1400.332227 elapsed)
900000 done, (1483.803302 elapsed)
950000 done, (1565.460525 elapsed)
1000000 done, (1648.073636 elapsed)
1050000 done, (1729.447011 elapsed)
1100000 done, (1813.645484 elapsed)
1150000 done, (1896.760631 elapsed)
1200000 done, (1980.596823 elapsed)
1250000 done, (2062.273566 elapsed)
1300000 done, (2143.396331 elapsed)
1350000 done, (2225.690177 elapsed)
