In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the annual CSI match data; comma is read_csv's default delimiter.
csv_path = 'data/annual_csi_data_for_match.csv'
df0 = pd.read_csv(csv_path)

In [4]:
# Normalise the raw frame: rename the 'datatime' column to 'datetime', parse
# it into real timestamps, and renumber the rows 0..n-1.
# `index=str` was dropped from the rename: it only stringified the row labels,
# which reset_index(drop=True) throws away immediately afterwards.
df0 = (
    df0.rename(columns={"datatime": "datetime"})
       .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
       .reset_index(drop=True)
)

In [5]:
# Preview the first rows of the loaded frame.
df0.head()

Unnamed: 0,datetime,lat,lon,city,actualkwh,irradiancekwh,pr,mount_type,koeppen,capacity_dc_kw,tilt,azimuth,paneltype,cellcategory,cod,invertermake,invertermodel,panelmake,panelmodel
0,2009-01-01,36.817933,-119.759976,Fresno,215457.0875,278138.15643,0.774641,Fixed - Roof Mounted,Moderate,161.12,30.0,180.0,mono,Premium,2008-09-05,SatCon,AE-135-60-PV-A,SunPower,PL-EVER-ES-190P
1,2010-01-01,36.817933,-119.759976,Fresno,193487.66,261959.131597,0.738618,Fixed - Roof Mounted,Moderate,161.12,30.0,180.0,mono,Premium,2008-09-05,SatCon,AE-135-60-PV-A,SunPower,PL-EVER-ES-190P
2,2009-01-01,34.016505,-118.113753,Montebello,288246.65,416118.658276,0.692703,Fixed - Roof Mounted,Moderate,243.2,30.0,180.0,mono,Premium,2007-08-01,Xantrex,PV225S-480-P,SunPower,PL-EVER-ES-190P
3,2010-01-01,34.016505,-118.113753,Montebello,252797.225,409327.120794,0.617592,Fixed - Roof Mounted,Moderate,243.2,30.0,180.0,mono,Premium,2007-08-01,Xantrex,PV225S-480-P,SunPower,PL-EVER-ES-190P
4,2009-01-01,33.782519,-117.228648,Perris,56125.0,81674.96556,0.687175,Fixed - Roof Mounted,Moderate,46.592,30.0,180.0,poly,Standard,2007-10-15,SatCon,AE-50-60-PV-D,Sharp,ND-208U2


In [5]:
def val_counts(df, exclude=('lat', 'lon')):
    """Print the value counts of every column of ``df`` except those excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    exclude : iterable of column labels, optional
        Columns to skip. Defaults to 'lat' and 'lon'.

    Returns
    -------
    None
        The counts are printed, one Series per column.
    """
    skipped = set(exclude)
    for column in df.columns:
        if column in skipped:
            continue
        # Index with the label itself: formatting it into a string first would
        # raise KeyError for any non-string column label.
        print(df[column].value_counts())

In [6]:
def dt_to_year(df):
    """Replace the 'datetime' column of ``df`` with an integer 'year' column.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'datetime' column holding datetime-like values.
    """
    # Vectorised year extraction instead of a per-row loop over
    # Series.iteritems(), which no longer exists in pandas 2.0+.
    # pd.to_datetime is a no-op for datetime64 columns and also accepts
    # object columns of datetime/Timestamp values.
    df['year'] = pd.to_datetime(df['datetime']).dt.year.astype(int)
    df.drop(columns=['datetime'], inplace=True)

In [7]:
def pop_year(yr, frame=None):
    """Return the rows whose 'year' column equals ``yr``.

    Parameters
    ----------
    yr : int
        Year to select.
    frame : pandas.DataFrame, optional
        Frame to filter; it must have a 'year' column. When omitted, the
        notebook-level ``df`` is used, which is what this function always
        read before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The matching rows of the source frame.
    """
    # Passing the frame explicitly avoids depending on whatever the global
    # `df` happens to hold when the cell is run.
    source = df if frame is None else frame
    return source[source['year'] == yr]

In [78]:
def _mixed_to_float(column, fill):
    """Cast ``column`` to float, using ``fill`` wherever it holds 'Mixed'."""
    is_mixed = column == 'Mixed'
    # Blank out the sentinel first so the float cast cannot fail on it.
    numeric = column.where(~is_mixed).astype(float)
    # `fill` may be a scalar or a Series aligned on the same index.
    return numeric.where(~is_mixed, fill)


def clean_data(df):
    """Drop unused columns and normalise categorical/orientation fields.

    The frame is modified in place and also returned.

    Steps
    -----
    * drop city, cod, panelmodel, invertermodel, lon, pr, invertermake and
      panelmake;
    * missing ``paneltype`` becomes 'poly', missing ``mount_type`` becomes
      'Fixed - Roof Mounted';
    * ``cellcategory`` 'Unknown' becomes 'Standard';
    * ``azimuth`` 'Mixed' becomes 180, and the column is cast to float;
    * ``tilt`` 'Mixed' becomes that row's latitude rounded to one decimal,
      and the column is cast to float;
    * any ``paneltype`` other than 'poly' or 'mono' becomes 'other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the dropped columns plus lat, paneltype, mount_type,
        cellcategory, azimuth and tilt.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    drop_list = ['city', 'cod', 'panelmodel', 'invertermodel', 'lon', 'pr',
                 'invertermake', 'panelmake']
    df.drop(columns=drop_list, inplace=True)

    # Assign results back to the frame: `inplace=True` on a selected column
    # is chained assignment and does not reliably update `df` in pandas
    # versions that use Copy-on-Write.
    df['paneltype'] = df['paneltype'].fillna('poly')
    df['mount_type'] = df['mount_type'].fillna('Fixed - Roof Mounted')
    df['cellcategory'] = df['cellcategory'].replace('Unknown', 'Standard')

    df['azimuth'] = _mixed_to_float(df['azimuth'], 180)
    # df['tilt_mixed'] = (df['tilt']=='Mixed')*1
    # Row-aligned substitution: Series.replace with a Series as the value
    # does not pair each 'Mixed' entry with the latitude of its own row.
    df['tilt'] = _mixed_to_float(df['tilt'], df['lat'].round(1))

    # Vectorised form of the former per-row loop over Series.iteritems(),
    # which no longer exists in pandas 2.0+.
    known_type = df['paneltype'].isin(['poly', 'mono'])
    df['paneltype'] = df['paneltype'].where(known_type, 'other')
    return df

In [79]:
# Work on a copy so the loaded frame df0 stays untouched.
df = df0.copy()

In [80]:
# clean_data drops columns from the frame it is given, so re-running this cell
# on an already-cleaned `df` would fail; starting from a fresh copy of df0
# keeps the cell re-runnable.
df = clean_data(df0.copy())

In [81]:
# Keep latitude to one decimal place.
df['lat'] = df['lat'].round(1)

In [82]:
# Orientation scores: each is 1 when the panel matches the reference value
# (tilt == latitude, azimuth == 180) and falls off linearly with the
# relative deviation from it.
tilt = np.array(df['tilt'], dtype=float)
lat = np.array(df['lat'], dtype=float)
az = df['azimuth'].values

tilt_deviation = np.abs(tilt - lat) / lat
az_deviation = np.abs(az - 180) / 180

df['opt_norm_tilt'] = 1 - tilt_deviation
df['opt_norm_az'] = 1 - az_deviation

In [83]:
# Summary statistics of the numeric columns, including the two new opt_norm_* features.
df.describe()

Unnamed: 0,lat,actualkwh,irradiancekwh,capacity_dc_kw,tilt,azimuth,opt_norm_tilt,opt_norm_az
count,8157.0,8157.0,8157.0,8157.0,8157.0,8157.0,8157.0,8157.0
mean,35.863626,285974.7,304901.5,186.823048,18.001018,183.487066,0.501726,0.909978
std,2.161258,477579.6,514353.7,299.150419,10.136918,33.426124,0.278557,0.16357
min,32.6,796.208,1632.335,1.15,0.0,0.0,-0.142857,0.0
25%,33.8,8416.804,9172.811,5.775,10.0,180.0,0.292398,0.877778
50%,36.3,81872.0,89284.59,56.4,18.0,180.0,0.510204,1.0
75%,37.8,371645.0,400534.5,251.16,23.0,184.0,0.651042,1.0
max,41.8,7867719.0,9247761.0,4725.84,72.0,270.0,1.0,1.0


In [84]:
# One-hot encode the categorical descriptors; the source columns are replaced
# by one indicator column per category value.
categorical_cols = ['mount_type', 'koeppen', 'paneltype', 'cellcategory']
df = pd.get_dummies(df, columns=categorical_cols)

In [85]:
# Transposed preview: one row per column makes the wide encoded frame readable.
df.head().T

Unnamed: 0,0,1,2,3,4
datetime,2009-01-01 00:00:00,2010-01-01 00:00:00,2009-01-01 00:00:00,2010-01-01 00:00:00,2009-01-01 00:00:00
lat,36.8,36.8,34,34,33.8
actualkwh,215457,193488,288247,252797,56125
irradiancekwh,278138,261959,416119,409327,81675
capacity_dc_kw,161.12,161.12,243.2,243.2,46.592
tilt,30,30,30,30,30
azimuth,180,180,180,180,180
opt_norm_tilt,0.815217,0.815217,0.882353,0.882353,0.887574
opt_norm_az,1,1,1,1,1
mount_type_1-Axis,0,0,0,0,0


In [86]:
# List every column label now present in the frame.
df.columns

Index(['datetime', 'lat', 'actualkwh', 'irradiancekwh', 'capacity_dc_kw',
       'tilt', 'azimuth', 'opt_norm_tilt', 'opt_norm_az', 'mount_type_1-Axis',
       'mount_type_Fixed - Roof Mounted', 'koeppen_Arid', 'koeppen_Moderate',
       'paneltype_mono', 'paneltype_other', 'paneltype_poly',
       'cellcategory_Mixed', 'cellcategory_Premium', 'cellcategory_Standard',
       'cellcategory_Thin Film'],
      dtype='object')

In [88]:
# Columns kept for the correlation study: the energy output, system size,
# the two orientation scores and all one-hot indicator columns.
cols = [
    'actualkwh',
    'capacity_dc_kw',
    'opt_norm_tilt',
    'opt_norm_az',
    'mount_type_1-Axis',
    'mount_type_Fixed - Roof Mounted',
    'koeppen_Arid',
    'koeppen_Moderate',
    'paneltype_mono',
    'paneltype_other',
    'paneltype_poly',
    'cellcategory_Mixed',
    'cellcategory_Premium',
    'cellcategory_Standard',
    'cellcategory_Thin Film',
]
df_new = df[cols]

In [89]:
# Transposed preview of the selected columns.
df_new.head().T

Unnamed: 0,0,1,2,3,4
actualkwh,215457.0875,193487.66,288246.65,252797.225,56125.0
capacity_dc_kw,161.12,161.12,243.2,243.2,46.592
opt_norm_tilt,0.815217,0.815217,0.882353,0.882353,0.887574
opt_norm_az,1.0,1.0,1.0,1.0,1.0
mount_type_1-Axis,0.0,0.0,0.0,0.0,0.0
mount_type_Fixed - Roof Mounted,1.0,1.0,1.0,1.0,1.0
koeppen_Arid,0.0,0.0,0.0,0.0,0.0
koeppen_Moderate,1.0,1.0,1.0,1.0,1.0
paneltype_mono,1.0,1.0,1.0,1.0,0.0
paneltype_other,0.0,0.0,0.0,0.0,0.0


In [90]:
# Pairwise correlation matrix of the selected columns.
df_new.corr()

Unnamed: 0,actualkwh,capacity_dc_kw,opt_norm_tilt,opt_norm_az,mount_type_1-Axis,mount_type_Fixed - Roof Mounted,koeppen_Arid,koeppen_Moderate,paneltype_mono,paneltype_other,paneltype_poly,cellcategory_Mixed,cellcategory_Premium,cellcategory_Standard,cellcategory_Thin Film
actualkwh,1.0,0.973386,-0.208641,0.152814,0.44091,-0.44091,-0.067582,0.067582,-0.075434,-0.000155,0.074306,0.020307,-0.076296,0.064678,0.022435
capacity_dc_kw,0.973386,1.0,-0.203314,0.151988,0.36796,-0.36796,-0.071578,0.071578,-0.079075,0.010322,0.073743,0.034817,-0.079964,0.063657,0.029079
opt_norm_tilt,-0.208641,-0.203314,1.0,0.091695,-0.24294,0.24294,-0.010415,0.010415,-0.013149,-0.023171,0.022112,0.033324,-0.012383,0.019145,-0.040794
opt_norm_az,0.152814,0.151988,0.091695,1.0,0.071035,-0.071035,0.034861,-0.034861,-0.004475,0.021211,-0.003991,0.017963,-0.004172,-0.008597,0.030447
mount_type_1-Axis,0.44091,0.36796,-0.24294,0.071035,1.0,-1.0,-0.007621,0.007621,0.005694,-0.001869,-0.004864,-0.012418,0.005477,-0.006886,0.011697
mount_type_Fixed - Roof Mounted,-0.44091,-0.36796,0.24294,-0.071035,-1.0,1.0,0.007621,-0.007621,-0.005694,0.001869,0.004864,0.012418,-0.005477,0.006886,-0.011697
koeppen_Arid,-0.067582,-0.071578,-0.010415,0.034861,-0.007621,0.007621,1.0,-1.0,0.180061,-0.042549,-0.160382,-0.01828,0.17968,-0.163708,-0.033115
koeppen_Moderate,0.067582,0.071578,0.010415,-0.034861,0.007621,-0.007621,-1.0,1.0,-0.180061,0.042549,0.160382,0.01828,-0.17968,0.163708,0.033115
paneltype_mono,-0.075434,-0.079075,-0.013149,-0.004475,0.005694,-0.005694,0.180061,-0.180061,1.0,-0.160906,-0.920554,-0.069128,0.998447,-0.933848,-0.12523
paneltype_other,-0.000155,0.010322,-0.023171,0.021211,-0.001869,0.001869,-0.042549,0.042549,-0.160906,1.0,-0.237403,0.321365,-0.153428,-0.149306,0.778278


In [None]:
# NOTE(review): unfinished placeholder. `['']` requests a column whose label is
# the empty string, which is not among the columns listed by df.columns earlier
# in this notebook, so this line is expected to raise KeyError as written.
# TODO: fill in the intended feature columns.
X = df.loc[:,['']]

### Dealing with extreme PR values

In [141]:
####  df.sort_values(by =['pr'], ascending=False)