In [2]:
import praw
import nltk
import pymongo
import time
import json
import math
from collections import defaultdict
from operator import itemgetter

In [3]:
class AccessDB():
    def __init__(self):
        self.client = pymongo.MongoClient()

    def useDB(self,dbname):
        self.db = self.client[dbname]

    def createCollection(self,name):
        try:
            self.db.create_collection(name)
            return True
        except:
            return False
    
    def insert(self,collection,query):
        '''
        para query :: dict
        '''
        #query = json.loads(query)
        self.db[collection].insert_one(query)
    
    def update(self,collection,target,query):
        
        self.db[collection].update_one(target,query)
    
    def find(self, query = None, projection = None, collection = None):
    
        if collection:
            documents = [i for i in self.db[collection].find(query, projection)]
            return documents
        
        else:
            for i in self.db.collection_names():
                document = self.db[i].find(query, projection)
                
                for i in document:
                    documents.append(i)
            
            return documents

In [4]:
class controlData():
    def __init__(self,db):
        '''
        parameter db is instance of AccessDB() 
        '''
        self.DB = db
    
    def inputPosts(self, db, subreddit, posts):
        
        result = False
        
        if type(posts) is list:
            for post in posts:
                result = self.inputPost(db, subreddit, post)
                
            return result
            
        elif type(posts) is dict:
            result = self.inputPost(db, subreddit, posts)
            return result
            
        else:
            return result
            
            
            
    def inputPost(self, db, subreddit, post):
        self.DB.useDB(db)
        
        if post.has_key('id'):
            id_query = {"ID" : post['id']}
        elif post.has_key('ID'):
            id_query = {"ID" : post['ID']}
        else:
            print "has not key(id)"
            return False
        
        if not self.DB.find(query = id_query, collection = subreddit):
            self.DB.insert(subreddit,id_query)
        
        for key in post.keys():
            if key is 'id' or key is 'ID':
                continue
            
            update_query = {'$set' : {key : post[key]}}
            
            self.DB.update(subreddit,id_query,update_query)
            
        return True
    
    def findPost(self, db, subreddit = None, query = None):
        self.DB.useDB(db)
        
        return self.DB.find(query = query, collection = subreddit)

In [5]:
class Reddit():
    def __init__(self, db = None):
        '''
        parameter db is instance of AccessDB() 
        '''
        
        client = open('client_key')
        
        self.reddit = praw.Reddit(client_id = client.readline(),
                             client_secret = client.readline(),
                             user_agent = 'my user agent')
        
        self.subreddits = ['programming',
                           'gaming',
                           'movies',
                           'hacking',
                           'worldnews',
                           'futurology',
                           'sports',
                           'news',
                           'music',
                           'science',
                           'technology',
                           'politics',
                           'nba',
                           'soccer',
                           'design'
                          ]
        if db:
            self.db = controlData(db)
            self.dbname = 'reddit'
        
    def split_date(self,date):
        date = date.split(":")
            
        '''start_time = date[0] + " 00:00"
        end_time = date[1]+" 00:00"
            
        start_time = time.mktime(time.strptime(start_time, '%Y%m%d %H:%M'))
        end_time = time.mktime(time.strptime(end_time, '%Y%m%d %H:%M'))
        '''
        
        start_time = str2stamp(date[0])
        end_time = str2stamp(date[1])
        
        end_time = end_time + 86400
        
        return start_time,end_time
    
        
    def request2dbinsert(self, date, subreddit = None):
        '''
        date를 1일 단위로 쪼개서 request 후 db에 넣음
        '''
        start_time,end_time = self.split_date(date)
        
        
        s = start_time
        while(s < end_time):
            e = s + 86400
                
            request_time = stamp2str(s) + ":" + stamp2str(e)
            
            print "-" * 10 + request_time + " request " + "-" * 10
            
            self.subreddits_request(request_subreddit = subreddit, date = request_time, parse = True, db = True)
                
            print "-" * 10 + request_time + " complete " + "-" * 10
            
            s += 86400
    
    def subreddits_request(self, request_subreddit = None, date=None, parse = False, db = False):
        '''
        self.subreddits에 있는 서브렛딧들을 한번에 요청
        parse : parse 까지 실행
        db : db insert까지 실행 (parse가 true 일때만)
        '''
        submissions = {}
        
        if request_subreddit:
            if type(request_subreddit) is list:
                subreddits = request_subreddit
            else:
                subreddits = [request_subreddit]
                
        else:
            subreddits = self.subreddits
            
            
        for subreddit in subreddits:
            print subreddit + " request"
            submissions[subreddit] = self.list_request(subreddit,date=date)
            
            if parse:
                submissions[subreddit] = self.parse(submissions[subreddit])
                
                print subreddit + " parse"
                
                if db:
                    self.db.inputPosts(self.dbname,subreddit,submissions[subreddit])
                    
                    print subreddit + " insert to db "
                    
                    submissions[subreddit] = True
        
        return submissions
            
        
    def list_request(self,subreddit,num=None,date=None):
        
        if date:
            start_time,end_time = self.split_date(date)
            
            print start_time
            print end_time
            
            query="timestamp:"+str(int(start_time))+".."+str(int(end_time))
            
            
            result = self.reddit.subreddit(subreddit).submissions(start_time, end_time)
             
            if not result:
                raise last_exception
        
        else:
            result = self.reddit.subreddit(subreddit).new(limit=num)
            
        return result
        
    
    def parse(self,reddit_list):
        posts = []
        
        time_start = int(time.time())
        timeout=900
        
        while int(time.time()) < time_start + timeout:
            try:

                for submission in  reddit_list:
                    comments = []
                    post = {}

                    post['id'] = submission.id
                    post['title'] = submission.title
                    post['date'] = submission.created
                    post['vote'] = submission.score
                    post['content'] = submission.selftext
                    post['url'] = submission.url

                    for temp_comment in list(submission.comments):

                        comment = {}

                        if not hasattr(temp_comment, 'body'):
                            continue

                        comment['id'] = temp_comment.id
                        comment['content'] = temp_comment.body
                        comment['date'] = temp_comment.created
                        comment['vote'] = temp_comment.score

                        comments.append(comment)

                    post['comments'] = comments

                    posts.append(post)
                

            except:
                #wait for 30 seconds since sending more requests to overloaded server might not be helping
                #last_exception = e
                print "*" * 50
                print "error! waiting 30sec"
                print "*" * 50
                time.sleep(30)
            else:
                break
        
        return posts

In [6]:
class Analyse():
    '''def __init__(self,db):
        self.DB = controlData(db)
        
    def get_posts(self,subreddit,date=None):
        dbname = 'reddit'
        posts = self.DB.findPost(dbname, collection = subreddit)
        return posts
        '''
        
        
    def tokenize(self, text):
        try:
            tokens = nltk.word_tokenize(text)
        except:
            print text
        tagged_tokens = nltk.pos_tag(tokens)

        #parser_en = nltk.RegexpParser("NP: {<DT>?<JJ>?<NN.*>*}")
        #chunks_en = parser_en.parse(tagged_tokens)
        #chunks_en.draw()

        return tagged_tokens
        
    def posts_analyze(self, posts):
        if type(posts) is list:
            documents = []
            
            for post in posts:
                documents.append(self.posts_analyze(post))
            
            return documents
            
        if type(posts) is dict:
            document = {}
            
            document['ID'] = posts['ID']
            title_analyse = self.tokenize(posts['title'])

            document.update(self.choose_noun(title_analyse))

            if posts['content']:
                content_analyse = self.tokenize(posts['content'])

                document.update(self.choose_noun(content_analyse))

            if posts['comments']:
                document['COMMENTS'] = []
                
                for comment in posts['comments']:
                    comment_analyse = self.tokenize(comment['content'])
        
                        
                    comment_noun = self.choose_noun(comment_analyse)
                    comment_noun['ID'] = comment['id']

                    document['COMMENTS'].append(comment_noun)
            
            return document
        
        
        
    def choose_noun(self,tagged_token):
        noun_dict = {}
        except_noun = [".", "[", "]", ">", "<", "/*", "*/", "*", "+", "-", "=", "%"]
        
        for i in range(len(tagged_token)):
            if "NN" in tagged_token[i][1]:
                
                noun = tagged_token[i][0].lower()
                
                if noun in except_noun or "." in noun:
                    continue
                
                if noun in noun_dict.keys():
                    noun_dict[noun] += 1
                    
                else:
                    noun_dict[noun] = 1
                    
        if type(noun_dict) is int:
            print tagged_token
        return noun_dict
        
        
    def token_input(self,tagged_token):  #tagged-token : nltk로 pos_tag한 token
        input_word=set()
        
        for i in range(len(tagged_token)):
            if "NN" in tagged_token[i][1]:
                input_word.add(tagged_token[i][0].lower())
            
        for word in input_word:
            if self.word_dict.has_key(word):
                self.word_dict[word] += 1
            else:
                self.word_dict[word] = 1

In [7]:
def str2stamp(date):
    date += " 00:00"
    return time.mktime(time.strptime(date, '%Y%m%d %H:%M'))

In [8]:
def stamp2str(timestamp):
    return time.strftime("%Y%m%d", time.localtime(int(timestamp)))

In [9]:
def start():
    db=AccessDB()
    reddit = Reddit(db)
    reddit.request2dbinsert("20170101:20170228")
    ana = Analyse()
    posts = reddit.db.findPost('reddit',subreddit='futurology')
    nouns = ana.posts_analyze(posts)
    reddit.db.inputPosts('noun','futurology',nouns)
    print " complete"

In [10]:
def make_id_list(date, end_date = None):
    '''
    하루 이상의 데이터를 이용할 경우 끝나는 날을 end_date 입력
    make_id_list("20170101", end_date = "20170108")
    Reddit.subreddits 에 있는 모든 서브레딧에서 부터 해당하는 날짜의 명사들을 가져옴
    
    '''
    day = 86400
    start_time = str2stamp(date)
    
    if end_date:
        end_time = str2stamp(end_date) + day
    else:
        end_time = start_time + day
    
    query = {"$and" : [ 
        { "date" : { "$gte" : start_time}}, 
        { "date" : { "$lt" : end_time}}
    ]}
    
    projection = {"_id" : 0, "ID" : 1}
    
    reddit = AccessDB()
    reddit.useDB("reddit")
    
    noun = AccessDB()
    noun.useDB("noun")
    
    subreddits = Reddit().subreddits
    
    id_list = {}
    
    for subreddit in subreddits:
        
        id_list[subreddit] = reddit.find(collection = subreddit, query = query, projection = projection)
        
        for ID in id_list[subreddit]:
            id_query = {"ID":ID["ID"]}
            noun_list = noun.find(collection = subreddit, query = id_query)
            
            for nouns in noun_list:
                ID.update(nouns)
       
    return id_list

In [11]:
def df(documents):
    """
    ID, _id, COMMENTS, 명사명단이 포함된
    서브레딧하나만 받아야함
    """
    df = {}
    
    exception = ["COMMENTS", "ID", "_id"]
    
    for submission in documents:
        df[submission["ID"]] = set()


        df[submission["ID"]] = set([ i for i in submission if i not in exception])
        
        if "COMMENTS" not in submission:
            continue

        for comment in submission["COMMENTS"]:
            df[submission["ID"]] = df[submission["ID"]].union(set(
                [ i for i in comment if i not in exception]))
            
    return df

In [12]:
def tf(documents):
    """
    ID, _id, COMMENTS, 명사명단이 포함된
    서브레딧하나만 받아야함
    """
    exception = ["COMMENTS", "ID", "_id"]
    
    tf = {}
    tf = defaultdict(lambda:0, tf)
    
    for submission in documents:
        frequency = {}
        frequency = defaultdict(lambda:0, frequency)
        total = 0
        
        for noun_name in submission.keys():
            if noun_name in exception:
                continue
            frequency[noun_name] += float(submission[noun_name])
            total += submission[noun_name]
            
        if "COMMENTS" not in submission:
            continue
            
        for comments_list in submission["COMMENTS"]:
            for noun_name in comments_list.keys():
                if noun_name in exception:
                    continue
                    
                frequency[noun_name] += float(comments_list[noun_name])
                total += comments_list[noun_name]
    
        for noun_name in frequency.keys():
            tf[noun_name] += frequency[noun_name] / total
            
    #sorted_tf = sorted(tf.items(), key=itemgetter(1), reverse = True) 
    
    return tf

In [13]:
def score(for_tf, for_df):
    all_df = {}
    for subreddit in for_tf.keys():
        all_df.update(df(for_tf[subreddit]))

    for subreddit in for_df.keys():
        all_df.update(df(for_df[subreddit]))
        
    noun_df = {}
    noun_df = defaultdict(lambda:0, noun_df)
    df_total = 0

    for sub_id in all_df.keys():
        for keyword in all_df[sub_id]:
            noun_df[keyword] += 1
        df_total += 1
        
    all_tf={}
    
    for i in for_tf.keys():
        all_tf[i] = tf(for_tf[i])
        
    tf_idf = {}
    for subreddit in all_tf.keys():
        tf_idf[subreddit] = {}
        for keyword in all_tf[subreddit]:
            tf_idf[subreddit][keyword] = all_tf[subreddit][keyword] * math.log10(float(df_total) / noun_df[keyword])
            
    #sorted_tf_idf = {}
    
    #sorted_tf_idf['programming'] = sorted(tf_idf['programming'].items(), key=itemgetter(1), reverse = True) 
    
    return tf_idf

In [23]:
def date2list(start_date, end_date, pre_day = 0):
    result = []
    
    day = 86400
    
    stamp_start_date = str2stamp(start_date)
    stamp_end_date = str2stamp(end_date)
    
    date = stamp_start_date - pre_day*day
    
    result = [stamp2str(date)]
    
    while(date < stamp_end_date):
        date += day
        
        result.append(stamp2str(date))
        
    return result

In [15]:
def test_insert_tf_idf(start_date, end_date):
    scoreDB = AccessDB()
    scoreDB.useDB('score')
    
    stamp_start_date = str2stamp(start_date)
    stamp_end_date = str2stamp(end_date)
    
    date = stamp_start_date
    
    day = 86400
    
    while(date <= stamp_end_date):
        x_week_date = stamp2str(date - day*7)
        yesterday = stamp2str(date - day)
        
        today = stamp2str(date)
        
        
        today_list = make_id_list(today)
        x_week_list = make_id_list(x_week_date, end_date = yesterday)
        
        tf_idf = score(today_list, x_week_list)
        
        scoreDB.insert("d"+today, tf_idf)
            
        date += day

In [107]:
def test_trend_score(date):
    scoreDB = AccessDB()
    scoreDB.useDB('score')
    
    documents = scoreDB.find(collection = "d" + date)[0]
        
        
    x_week = date2list(date, date, pre_day = 7)
    x_week.remove(date)
        
    x_week_document = [ scoreDB.find(collection = "d" + i)[0] for i in x_week]
        
    documents.pop("_id")
        
    keyword_score = {}
        
    for subreddit in documents.keys():
        keyword_score[subreddit] = {}
        for keyword in documents[subreddit].keys():
            score_sum = 0
            for i in range(len(x_week_document)):
                if x_week_document[i][subreddit].get(keyword):
                    score_sum += x_week_document[i][subreddit][keyword]
                        
            keyword_score[subreddit][keyword] = documents[subreddit][keyword] - score_sum/len(x_week_document)
                
                
    return keyword_score

In [108]:
if __name__=="__main__":
    #test_insert_score("20170108", "20170227")
    
    #start_date = "20170115"
    start_date = "20170227"
    end_date = "20170227"
    
    
    today = make_id_list("20170227",end_date = "20170227")
    x_week = make_id_list("20170220",end_date = "20170226")

In [109]:
sss = Reddit().subreddits

    tf_idf_result = {}
    tf_idf_result = score(today,x_week)

In [110]:
sorted_tf_idf ={}
    for i in sss:
        sorted_tf_idf[i] = sorted(tf_idf_result[i].items(), key=itemgetter(1), reverse = True)

In [111]:
tf_result = {}
    for i in sss:
        tf_result[i] = tf(today[i])

In [112]:
sorted_tf ={}
    for i in sss:
        sorted_tf[i] = sorted(tf_result[i].items(), key=itemgetter(1), reverse = True)

In [117]:
trend_score = test_trend_score("20170227")
    sorted_trend = {}
    for i in sss:
        sorted_trend[i] = sorted(trend_score[i].items(), key=itemgetter(1), reverse = True)

In [125]:
for i in sss:
        print "*"*40 + i + "*"*40
        print (" "*9 + "[" + "tf" + "]").ljust(30),
        print (" "*7 + "[" + "tf-idf" + "]").ljust(30),
        print (" "*5 + "[" + "trend_score" + "]")
        for j in range(30):
            print (str(j+1) + ":" + sorted_tf[i][j][0] + str(sorted_tf[i][j][1])).ljust(30),
            print (str(j+1) + ":" + sorted_tf_idf[i][j][0] + ":" + str(sorted_tf_idf[i][j][1])).ljust(30),
            print str(j+1) + ":" + sorted_trend[i][j][0] + ":" + str(sorted_trend[i][j][1])
        print

****************************************programming****************************************
         [tf]                         [tf-idf]                     [trend_score]
1:history0.557710420654        1:cobol:0.767344976576         1:cobol:0.732115503971
2:time0.31985468264            2:history:0.717947803957       2:history:0.666621552419
3:code0.259571034482           3:middlewares:0.616263733943   3:middlewares:0.616263733943
4:something0.257827819177      4:webserver:0.616263733943     4:webserver:0.616263733943
5:music0.222222222222          5:dynamo:0.593947144111        5:dynamo:0.593947144111
6:monitor0.222222222222        6:monitor:0.570146246799       6:golang:0.563729685958
7:language0.210775607314       7:golang:0.566116347575        7:monitor:0.524779811975
8:origin0.2                    8:csv:0.516410876181           8:csv:0.490531044555
9:fiction0.2                   9:code:0.491599087496          9:origin:0.487756974842
10:cobol0.2                    10:origin:0.4877