In [1]:
import pandas as pd
import re
import json
import io
import csv
import sys
from pprint import pprint
from collections import defaultdict
from bs4 import UnicodeDammit

In [None]:

# NOTE: this cell only calls helper functions; it does not define them. The
# parse_* / get_* helpers must already exist in the kernel (in this notebook
# they are defined in cells further down), otherwise these calls raise
# NameError on a fresh kernel.

# Disabled earlier experiments, kept for reference:
#self.day_data = defaultdict(dict)
#self.parse_sample_data()
#checkin_records = parse_checkin_data()

# Lookup from a POI id to the code of the city it belongs to.
# Format: {poiid: citycode, poiid2: citycode2, ...}
poiid_to_city = parse_poiid_data()

# Number of tweets in each city, by day.
# Format: {citycode: {day: tweet_count, day2: tweet_count2, ...}, citycode2: {...}, ...}
tweets_by_day = parse_tweet_numbers()

# Air pollution in each city, by day.
# Format: {citycode: {day: pollution_stat, day2: pollution_stat2, ...}, citycode2: {...}, ...}
pollution_by_day = parse_pollution_data()

# Lookup from city code to city name.
# Format: {citycode: cityname, citycode2: cityname2, ...}
code_to_city = get_city_name()

# Tweet counts by week, by city.
# Format: {citycode: {week: tweet_count, week2: tweet_count2, ...}, citycode2: {...}, ...}
weekly_tweets = get_weekly_tweets()

# Tweet counts by year, by city.
# Format: {citycode: {2015: tweet_count, 2016: tweet_count}, citycode2: {...}, ...}
yearly_tweets = get_yearly_tweets()

In [6]:

def get_city_name(cities_file="china_cities.txt"):
    """Build a lookup from city code to city name.

    Each non-blank line of ``cities_file`` is split on commas. The second
    field is used as the city name and the third field as the city code;
    trailing whitespace is stripped from both.

    Args:
        cities_file: Path of the comma-separated cities file. Defaults to
            ``"china_cities.txt"``, the path this function has always read.

    Returns:
        dict: ``{citycode: cityname, citycode2: cityname2, ...}``. When a
        code occurs on several lines, the last line wins.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    code_to_name = {}

    with open(cities_file) as f:
        for line in f:
            # Blank lines carry no record; without this guard they would
            # raise IndexError when the third field is read.
            if not line.strip():
                continue

            fields = line.split(",")
            code_to_name[fields[2].rstrip()] = fields[1].rstrip()

    return code_to_name




def parse_tweet_numbers(poiid_to_city=None,
                        checkin_records="raw_data/all_checkinrecords.csv"):
    """Count check-in records per city and per day.

    Every row of ``checkin_records`` is read with ``csv.reader``; the first
    CSV field is then split on runs of tabs, giving the POI id (first tab
    field) and the date (third tab field). The POI id is translated to a
    city code through ``poiid_to_city`` and the counter for that city/date
    is incremented.

    Args:
        poiid_to_city: Mapping ``{poiid: citycode}``. When omitted, it is
            built by calling ``parse_poiid_data()``.
        checkin_records: Path of the check-in records file. Defaults to
            the path this function has always read.

    Returns:
        defaultdict: ``{citycode: {day: tweet_count, ...}, ...}``.

    Raises:
        IndexError: If a non-empty row has fewer than three tab-separated
            fields in its first CSV column.
    """
    if poiid_to_city is None:
        poiid_to_city = parse_poiid_data()

    parsed_data = defaultdict(dict)

    csv.field_size_limit(sys.maxsize)

    # newline='' is what the csv module expects; the old 'rU' mode is
    # rejected by Python 3.11+.
    with open(checkin_records, newline='') as datafile:
        for row in csv.reader(datafile):
            if not row:
                # csv.reader yields [] for blank lines.
                continue

            data = re.split(r'\t+', row[0])
            date = data[2]
            poiid = data[0]

            # .get() rather than [] so that a defaultdict lookup table is
            # not silently populated with placeholder values for unknown
            # POIs. Falsy results (missing, empty string, placeholder {})
            # are all treated as "no city for this POI".
            citycode = poiid_to_city.get(poiid)
            if not citycode:
                print("City code does not exist.")
                continue

            per_day = parsed_data[citycode]
            per_day[date] = per_day.get(date, 0) + 1

    return parsed_data



def parse_pollution_data(pollution_data="raw_data/pollution_data.csv"):
    """Collect the pollution statistic per city and per day.

    For every non-empty CSV row, column 0 is used as the city code,
    column 1 as the date and column 20 as the pollution statistic. Values
    are stored exactly as read, i.e. as strings.

    Args:
        pollution_data: Path of the pollution CSV file. Defaults to the
            path this function has always read.

    Returns:
        defaultdict: ``{citycode: {day: pollution_stat, ...}, ...}``. If a
        city/date pair occurs more than once, the last row wins.

    Raises:
        IndexError: If a non-empty row has fewer than 21 columns.
    """
    # NOTE(review): any header row is stored like a data row, as before;
    # confirm whether the file has one.
    parsed_data = defaultdict(dict)

    csv.field_size_limit(sys.maxsize)

    # newline='' is what the csv module expects; the old 'rU' mode is
    # rejected by Python 3.11+.
    with open(pollution_data, newline='') as datafile:
        for row in csv.reader(datafile):
            if not row:
                # csv.reader yields [] for blank lines.
                continue

            citycode = row[0]
            date = row[1]
            pollution = row[20]

            # Always assign into the per-city dict. The previous code
            # replaced the whole per-city dict with the pollution string
            # when a date was seen twice, corrupting that city's entry.
            parsed_data[citycode][date] = pollution

    return parsed_data


def parse_poiid_data(poiid_locations="raw_data/poiid_locations.csv"):
    """Build a lookup from the first CSV column to the eighth.

    For every non-empty row, column 0 becomes the key and column 7 the
    value. Callers in this notebook treat the key as a POI id and the
    value as a city code.

    Args:
        poiid_locations: Path of the POI locations CSV file. Defaults to
            the path this function has always read.

    Returns:
        defaultdict: ``{poiid: citycode, poiid2: citycode2, ...}``. The
        container is kept as ``defaultdict(dict)`` for backward
        compatibility, so indexing an unknown key returns (and stores) an
        empty dict instead of raising ``KeyError``. Use ``.get()`` or
        ``in`` to test for membership.

    Raises:
        IndexError: If a non-empty row has fewer than eight columns.
    """
    parsed_data = defaultdict(dict)

    csv.field_size_limit(sys.maxsize)

    # newline='' is what the csv module expects; the old 'rU' mode is
    # rejected by Python 3.11+.
    with open(poiid_locations, newline='') as datafile:
        for row in csv.reader(datafile):
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            parsed_data[row[0]] = row[7]

    return parsed_data


def parse_checkin_data(checkin_records="raw_data/all_checkinrecords.csv"):
    """Count check-in records per day and per POI.

    Every row is read with ``csv.reader``; the first CSV field is then
    split on runs of tabs, giving the POI id (first tab field) and the
    date (third tab field).

    Args:
        checkin_records: Path of the check-in records file. Defaults to
            the path this function has always read.

    Returns:
        defaultdict: ``{date: {poiid: count, ...}, date2: {...}, ...}``.

    Raises:
        IndexError: If a non-empty row has fewer than three tab-separated
            fields in its first CSV column.
    """
    parsed_data = defaultdict(dict)

    csv.field_size_limit(sys.maxsize)

    # newline='' is what the csv module expects; the old 'rU' mode is
    # rejected by Python 3.11+.
    with open(checkin_records, newline='') as datafile:
        for row in csv.reader(datafile):
            if not row:
                # csv.reader yields [] for blank lines.
                continue

            data = re.split(r'\t+', row[0])
            date = data[2]
            poiid = data[0]

            # defaultdict creates the per-date dict on first access, so a
            # single get-and-increment covers every case.
            per_poi = parsed_data[date]
            per_poi[poiid] = per_poi.get(poiid, 0) + 1

    return parsed_data



def parse_sample_data(csvfile="raw_data/all_checkinrecords_sample.csv"):
    """Tally sample check-in records per day and per POI.

    The file is read with ``csv.DictReader``, so its first row is used as
    the header; the ``date`` and ``poiid`` columns are required.

    Args:
        csvfile: Path of the sample check-in CSV file. Defaults to the
            path this function has always read.

    Returns:
        int: Always ``0``. The tally itself is not returned.

    Raises:
        KeyError: If the header lacks a ``date`` or ``poiid`` column.
    """
    # NOTE(review): the original declared
    # column_names = ["poiid", "userid", "date", "time"] but never passed
    # it to DictReader. If the file has no header row it should be given
    # as ``fieldnames`` -- confirm against the data.
    parsed_data = defaultdict(dict)

    csv.field_size_limit(sys.maxsize)

    # newline='' is what the csv module expects; the old 'rU' mode is
    # rejected by Python 3.11+.
    with open(csvfile, newline='') as datafile:
        for row in csv.DictReader(datafile):
            # The inner dicts are plain dicts, so indexing a POI that has
            # not been seen yet on this date raised KeyError; use .get().
            per_poi = parsed_data[row['date']]
            per_poi[row['poiid']] = per_poi.get(row['poiid'], 0) + 1

    # NOTE(review): the tally is currently discarded. An earlier version
    # dumped it as JSON to 'sample.json'; that code was disabled.
    return 0

In [7]:
# Generate data structure for tweet counts by week, by city.
# Returns data structure with format: {citycode: {2015: tweet_count, 2016: tweet_count}, citycode2: {...} ...}
def get_yearly_tweets():

    data = json.load(open('data/media_by_city.json'))
    #pprint(data)
    print(type(data))