In [37]:
import calendar
import pickle
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression, Ridge

In [4]:
# Load the pickled weather and Hubway datasets into module-level names.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point these paths at files you created or otherwise trust.

# WeatherDatum is never referenced by name in this notebook; presumably it
# must be importable so pickle can rebuild the stored objects (TODO confirm).
from weather_collection import WeatherDatum

# 'with' closes each file handle as soon as its load finishes instead of
# leaving it open for the lifetime of the kernel.
with open('LargeDataStorage/weatherDataFile', 'rb') as weather_file:
    weather = pickle.load(weather_file)

with open('LargeDataStorage/hubwayDataFile', 'rb') as hubway_file:
    hubway = pickle.load(hubway_file)

In [5]:
def count_riders(year, month, day, hour):
	"""
	Add up the riders stored in hubway.data for a single hour.

	Input: year, month, day, hour -- day is shifted by +1 before it is used
	to index hubway.data, so callers pass a zero-based day
	Output: sum of the lengths of that hour's sixty per-minute entries
	"""
	# all sixty minutes sit under the same year/month/day/hour path
	minute_slots = hubway.data[year][month][day+1][hour]

	total = 0
	for minute in range(60):
		entry = minute_slots[minute]
		# an entry of -1 marks a minute with no data; it contributes nothing
		if entry == -1:
			continue
		total += len(entry)
	return total

In [7]:
def process_data(year):
	"""
	Total one year's hourly ridership by temperature.

	Input: year -- calendar year used to index weather.data (and, through
	count_riders, hubway.data)
	Output: two parallel lists (temperatures, ridership). temperatures holds
	each distinct 'tempi' reading truncated to an int; ridership[i] is the sum
	of count_riders() over every hour whose reading was temperatures[i].
	Hours whose reading is below -100 are left out of both lists.
	"""
	riders_vs_temp = {}

	for month in range(1, 13):
		# monthrange applies the full Gregorian leap-year rule, so century
		# years such as 1900 or 2100 get a 28-day February (a plain
		# year % 4 test would give them 29)
		days_in_month = calendar.monthrange(year, month)[1]
		for day in range(days_in_month):
			for hour in range(24):
				# weather.data is indexed with day+1, i.e. 1..days_in_month
				temp = int(float(weather.data[year][month][day+1][hour]['tempi']))
				# readings below -100 are presumably a missing-data sentinel
				# (TODO confirm); those hours are skipped entirely
				if temp < -100:
					continue
				# count_riders takes the zero-based day and adds the +1 itself
				riders_vs_temp[temp] = riders_vs_temp.get(temp, 0) + count_riders(year, month, day, hour)

	# list() keeps the two-list return on Python 3, where keys()/values() are
	# views. Both come from the same unmodified dict, so position i in one
	# list corresponds to position i in the other.
	return list(riders_vs_temp.keys()), list(riders_vs_temp.values())

In [46]:
def lin_reg(year=2013, train_size=0.5, random_state=None):
    """
    Fit a linear regression of log(ridership) on log(temperature) and print
    the train and test R2 scores.

    Input:
        year -- year handed to process_data (default 2013)
        train_size -- passed to train_test_split as train_size (default 0.5)
        random_state -- passed to train_test_split; set it to an int to get
            the same split, and therefore the same scores, on every run
            (default None, i.e. a different random split each call)
    Output: None; the two scores are printed.
    """
    temperatures, ridership = process_data(year)

    # list() so dict views (Python 3) are accepted as well as plain lists
    temps = np.array(list(temperatures), dtype=float)
    rides = np.array(list(ridership), dtype=float)

    # log is undefined for temperatures <= 0 and would put -inf/nan into the
    # feature column; drop those points rather than hand them to the model
    usable = temps > 0
    temps = np.log(temps[usable])
    rides = rides[usable]

    # log(0 riders) is -inf; it is replaced with 0 below, so the expected
    # divide-by-zero warning is silenced here.
    # NOTE(review): 0 is also log(1), so a temperature with zero riders is
    # treated the same as one with a single rider.
    with np.errstate(divide='ignore'):
        rides = np.log(rides)
    rides[np.isneginf(rides)] = 0

    # column vectors: one sample per row, a single feature / target column
    temps = temps.reshape(-1, 1)
    rides = rides.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(
        temps, rides, train_size=train_size, random_state=random_state)

    model = LinearRegression()
    model.fit(X_train, y_train)
    # a single parenthesised argument prints identically on Python 2 and 3
    print("Train R2 %f" % model.score(X_train, y_train))
    print("Test R2 %f" % model.score(X_test, y_test))

lin_reg()

Train R2 0.702792
Test R2 0.717079




In [52]:
temperatures, ridership = process_data(2013)

# Float arrays; list() also accepts dict views under Python 3.
temps = np.array(list(temperatures), dtype=float)
rides = np.array(list(ridership), dtype=float)

# The natural log is only finite for positive values, so keep just the points
# where both coordinates are positive. This avoids the NumPy RuntimeWarning
# raised for log(0) and for the log of a negative number.
positive = (temps > 0) & (rides > 0)

plt.scatter(np.log(temps[positive]), np.log(rides[positive]))
plt.xlabel("ln(temperature)")
plt.ylabel("ln(total riders)")
plt.title("Hubway ridership vs. temperature, 2013 (natural log)")
plt.show()

  app.launch_new_instance()


In [53]:
temperatures, ridership = process_data(2013)

# Float arrays; list() also accepts dict views under Python 3.
temps = np.array(list(temperatures), dtype=float)
rides = np.array(list(ridership), dtype=float)

# log10 is only finite for positive values, so keep just the points where
# both coordinates are positive. This avoids the NumPy RuntimeWarning raised
# for log10(0) and for the log of a negative number.
positive = (temps > 0) & (rides > 0)

plt.scatter(np.log10(temps[positive]), np.log10(rides[positive]))
plt.xlabel("log10(temperature)")
plt.ylabel("log10(total riders)")
plt.title("Hubway ridership vs. temperature, 2013 (log10)")
plt.show()

  app.launch_new_instance()


In [54]:
temperatures, ridership = process_data(2013)

# One point per distinct integer temperature; y is the total rider count
# accumulated over every hour at that temperature.
# list() so dict views (Python 3) are accepted as well as plain lists.
plt.scatter(list(temperatures), list(ridership))
plt.xlabel("temperature (tempi)")
plt.ylabel("total riders")
plt.title("Hubway ridership vs. temperature, 2013")
plt.show()

In [55]:
temperatures, ridership = process_data(2013)

# Float arrays: squaring integer rider totals can overflow on platforms whose
# default NumPy integer is 32-bit. list() also accepts Python 3 dict views.
temps = np.array(list(temperatures), dtype=float)
rides = np.array(list(ridership), dtype=float)

# scatter, like the other ridership-vs-temperature cells: the points arrive
# in dict-iteration order, so joining them with a line (plt.plot) would draw
# segments whose path depends on that arbitrary order.
plt.scatter(np.square(temps), np.square(rides))
plt.xlabel("temperature squared")
plt.ylabel("total riders squared")
plt.title("Hubway ridership vs. temperature, 2013 (squared)")
plt.show()