# Data Processing

In [8]:
import numpy as np
import pandas as pd
from netCDF4 import Dataset
from sklearn.ensemble import GradientBoostingRegressor
import sys
from scipy import stats
def get_all_data(prefix, loc_lat, loc_lon, sy, ey):
	"""Return daily "pr" values at one grid cell, grouped into 365-value years.

	Reads the variable "pr" from the three CCSM4 rcp85 daily files found
	under ``prefix`` (20060101-20401231, 20410101-20751231 and
	20760101-21001231), selects the grid cell matching ``loc_lat`` and
	``loc_lon``, multiplies every value by 3600 and cuts the series into
	consecutive chunks of 365 values.

	Args:
		prefix: Directory that contains the three NetCDF files.
		loc_lat: Latitude looked up on the file's "lat" axis.
		loc_lon: Longitude looked up on the file's "lon" axis.
		sy: First year to return; 2006 maps to the first chunk.
		ey: Year at which to stop (exclusive).

	Returns:
		A list of lists, one 365-value chunk per year in ``[sy, ey)``.
		Assumes the files use a 365-day calendar, so that each chunk is
		exactly one year -- TODO confirm against the files' time axis.
	"""
	def search_index(array, value):
		# Index i with array[i] <= value < array[i+1]. Values outside the
		# axis are clamped to the first / last index. The previous version
		# returned the leaked loop variable, i.e. the second-to-last index
		# for values past the end, and failed on a one-element axis.
		# Assumes the axis is sorted ascending -- TODO confirm.
		if value < array[0]:
			return 0
		for i in range(len(array) - 1):
			if array[i] <= value < array[i + 1]:
				return i
		return len(array) - 1

	def split(arr, size):
		# Consecutive chunks of `size` items; the final chunk may be
		# shorter. The previous version appended the remainder on every
		# loop iteration, interleaving bogus chunks with the real ones.
		return [arr[k:k + size] for k in range(0, len(arr), size)]

	def get_data(path, loc_lat, loc_lon):
		# Close the file as soon as the series has been copied out of it.
		with Dataset(path) as dataset:
			lon = dataset.variables['lon'][:]
			lat = dataset.variables['lat'][:]
			lat_index = search_index(lat, loc_lat)
			lon_index = search_index(lon, loc_lon)
			series = dataset.variables["pr"][:, lat_index, lon_index].tolist()
		# NOTE(review): if "pr" is a flux in kg m-2 s-1, a daily total in
		# mm would need a factor of 86400 rather than 3600 -- confirm.
		scaled = [3600 * x for x in series]
		return split(scaled, 365)

	res = []
	for span in ["20060101-20401231", "20410101-20751231", "20760101-21001231"]:
		fn = prefix+"/"+"pr_day_CCSM4_rcp85_r1i1p1_"+span+".nc"
		res.extend(get_data(fn, loc_lat, loc_lon))
	# Chunk 0 is the year 2006, so the year offsets index the list directly.
	return res[sy-2006:ey-2006]

def get_monthly_acc(filename):
	"""Count the rows of a CSV file per calendar month.

	The file must have a ``Date`` column whose characters 6-9 hold the year
	and characters 3-4 hold the month (e.g. ``dd/mm/yyyy``).

	Args:
		filename: Path of the CSV file to read.

	Returns:
		A list of row counts ordered by year-month key, with the first
		12 keys dropped.
	"""
	frame = pd.read_csv(filename, header=0, low_memory=False)
	year_part = frame.Date.str.slice(6, 10)
	month_part = frame.Date.str.slice(3, 5)
	# "yyyymm" keys sort chronologically as plain strings.
	per_month = (year_part + month_part).value_counts().to_dict()
	ordered_counts = [per_month[key] for key in sorted(per_month)]
	return ordered_counts[12:]

def split_to_month(l):
	"""Cut a 365-item sequence of daily values into 12 monthly slices.

	Month lengths are those of a non-leap year. Sequences shorter than
	365 items yield shorter (possibly empty) trailing slices.

	Args:
		l: Sliceable sequence of daily values, January 1st first.

	Returns:
		A list of 12 slices of ``l``, one per month.
	"""
	month_lengths = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
	months = []
	cursor = 0  # index of the first day of the current month
	for length in month_lengths:
		months.append(l[cursor:cursor + length])
		cursor += length
	return months


def get_top_prep(prefix, start, end, n_features):
	"""Return the largest daily values of every month in ``[start, end)``.

	Loads the yearly series for the fixed location (51.513917,
	360-0.172638) via ``get_all_data``, splits each year into months and
	keeps the ``n_features`` largest daily values of each month.

	Args:
		prefix: Directory passed through to ``get_all_data``.
		start: First year (inclusive).
		end: Last year (exclusive).
		n_features: Number of top values kept per month.

	Returns:
		A list with one descending-sorted list per month.
	"""
	yearly = get_all_data(prefix, 51.513917, 360-0.172638, start, end)

	top_per_month = []
	for year_days in yearly:
		for month_days in split_to_month(year_days):
			ranked = list(month_days)
			ranked.sort(reverse=True)
			top_per_month.append(ranked[:n_features])
	return top_per_month

def get_all_prep(prefix, start, end):
	"""Return the daily values of every month in ``[start, end)``.

	Loads the yearly series for the fixed location (51.513917,
	360-0.172638) via ``get_all_data`` and splits each year into months.

	Args:
		prefix: Directory passed through to ``get_all_data``.
		start: First year (inclusive).
		end: Last year (exclusive).

	Returns:
		A flat list with one list of daily values per month.
	"""
	yearly = get_all_data(prefix, 51.513917, 360-0.172638, start, end)
	return [
		month_days
		for year_days in yearly
		for month_days in split_to_month(year_days)
	]


# Monthly sums of the daily values for 2006 up to (not including) 2016.
# The per-month lists have different lengths (28/30/31 days), so they are
# summed straight from the Python lists: wrapping the ragged list in
# np.array() first raises ValueError on NumPy >= 1.24.
X = np.array([sum(month) for month in get_all_prep('/Users/hhu/Downloads/data/', 2006, 2016)])
# Monthly accident counts read from the CSV file.
y = np.array(get_monthly_acc('Accidents0515.csv'))
# Month index 0..119 as an array, so arithmetic such as slope*xi works
# whether slope is a NumPy scalar or a plain Python float.
xi = np.arange(0, 120)

# Plotting monthly rainfall and monthly accident counts with fitted linear trends

In [19]:
import matplotlib.pyplot as plt
# Figure 1: the monthly values in X against the month index 0..119.
plt.plot(range(0, 120), X)
# One tick every 12 months, labelled with the years 2006..2016.
plt.xticks(np.arange(0, 121, 12), [str(i) for i in range(2006,2017)])
# Least-squares straight line through X as a function of the month index.
slope, intercept, r_value, p_value, std_err = stats.linregress(np.arange(0, 120), X)
# xi (the month indices) is defined in the data-processing cell.
line = slope*xi+intercept
plt.plot(xi,line,'--',label='Interpolated trend, slope = '+str(round(slope,4)))
plt.legend(loc='upper right', fancybox=True, framealpha=0.3,ncol = 1)
plt.xlabel('Year')
plt.ylabel('Monthly rainfall amount (mm)')
plt.title('Monthly rainfall amount in the historical period 2006 - 2016 in the UK')

# Figure 2: start a new figure so the accident counts are not drawn on
# top of the rainfall plot.
plt.figure()
plt.plot(range(0, 120), y)

# Same straight-line fit, now for the accident counts; this overwrites
# slope, intercept, r_value, p_value, std_err and line from figure 1.
slope, intercept, r_value, p_value, std_err = stats.linregress(np.arange(0, 120), y)
line = slope*xi+intercept
plt.plot(xi,line,'--',label='Interpolated trend, slope = '+str(round(slope,2)))
plt.legend()
plt.xticks(np.arange(0, 121, 12), [str(i) for i in range(2006,2017)])
plt.xlabel('Year')
plt.ylabel('Monthly number of car accident')
# Last expression of the cell: its return value is what the notebook
# echoes as the cell output.
plt.title('Monthly number of car accidents from 2006 to 2016 in the UK')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Text(0.5,1,'Monthly number of car accidents from 2006 to 2016 in the UK')