In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
import os

# Gather the names of the regular files (sub-directories are skipped) that sit
# directly inside the input directory.
destdir = '../input/'
files = list(
    filter(
        lambda name: os.path.isfile(os.path.join(destdir, name)),
        os.listdir(destdir),
    )
)

In [None]:
# Show the input file names collected from `destdir`.
files

In [None]:
# Read only the first 100,000 rows of the fiscal-year-2017 file.
# The fiscal-year 2014, 2015, 2016 and 2018 files were referenced here with the
# same naming pattern under ../input/ but were never loaded; read them the same
# way if they are needed.
df2017 = pd.read_csv(
    '../input/Parking_Violations_Issued_-_Fiscal_Year_2017.csv',
    nrows=100000,
)

In [None]:
# Preview the first five rows (five is the default for head()).
df2017.head()

In [None]:
# (rows, columns) of the frame as loaded, before any columns are dropped.
df2017.shape

In [None]:
# Per-column overview: does the column contain any missing value, and how many
# distinct values does it hold?
column_summary = pd.DataFrame(
    {
        'Nan Entry': df2017.isnull().any(),
        'Unique Entry': df2017.nunique(axis=0),
    },
    index=df2017.columns.values,
)
column_summary

In [None]:
# Columns that the analysis visible in this notebook never reads.
drop_column = ['Vehicle Color','Plate ID','Issuing Agency','Street Code1','Street Code2','Street Code3','Vehicle Expiration Date',
               'Violation Location','Violation Precinct','Issuer Precinct','Issuer Code','Issuer Command','Issuer Squad','Time First Observed',
              'Violation In Front Of Or Opposite','House Number','Street Name','Intersecting Street','Date First Observed','Law Section',
              'Sub Division','Violation Legal Code','From Hours In Effect','To Hours In Effect','Meter Number','Unregistered Vehicle?','Feet From Curb',
              'Hydrant Violation','Double Parking Violation','Violation Post Code','No Standing or Stopping Violation','Summons Number']
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: columns that were already removed are skipped rather than
# raising KeyError.
df2017 = df2017.drop(drop_column, axis = 1, errors = 'ignore')

In [None]:
# (rows, columns) after the unused columns were dropped.
df2017.shape

In [None]:
# Work on a 10% random sample drawn without replacement.
# A fixed random_state makes the sample reproducible, so the plots below and the
# conclusions written about them do not change from one run to the next.
mini2017 = df2017.sample(frac = 0.1, replace = False, random_state = 42)

In [None]:
# (rows, columns) of the 10% sample.
mini2017.shape

## Barplot of 'Registration State'

In [None]:
# Count tickets per registration state once and reuse the result for both the
# bar lengths and the tick labels.
state_counts = mini2017['Registration State'].value_counts()
x_ticks = state_counts.index
heights = state_counts
y_pos = np.arange(len(x_ticks))

# Tall figure: one horizontal bar per state.
fig, ax = plt.subplots(figsize=(10,20))
ax.barh(y_pos, heights)

# Put the state names on the y-axis, one per bar.
ax.set_yticks(y_pos)
ax.set_yticklabels(x_ticks)

# Title and axis labels so the figure can be read on its own.
ax.set_title("Number of Parking Tickets per Registration State", fontsize=16)
ax.set_xlabel("No. of cars", fontsize=16)
ax.set_ylabel("Registration State", fontsize=16)

plt.show()


## How does the number of tickets change from month to month?
* ## In which months are the most parking tickets given?

In [None]:
# Month number of every ticket's issue date, extracted in one vectorized step
# instead of looping over the timestamps in Python.
month = pd.to_datetime(mini2017['Issue Date']).dt.month
# Tickets per month; value_counts() ignores dates that could not be parsed.
m_count = month.value_counts()

plt.figure(figsize=(12,8))
sns.barplot(y=m_count.values, x=m_count.index, alpha=0.6)
plt.title("Number of Parking Ticket Given Each Month", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.ylabel("No. of cars", fontsize=16)
plt.show()

From the barplot above, **May** and **October** have the highest number of tickets, while **July** and **December** have the lowest.

## How many parking tickets are given for each violation code?

In [None]:
violation_code = mini2017['Violation Code'].value_counts()
# Same counts ordered by code, so the left-to-right bar order is explicit.
codes_in_order = violation_code.sort_index()

plt.figure(figsize=(16,8))
f = sns.barplot(y=codes_in_order.values, x=codes_in_order.index,
                order=codes_in_order.index, alpha=0.6)
# The bars sit at positions 0..n-1, not at the code values themselves. To thin
# out the crowded axis, label every 5th bar with the code it actually
# represents; setting tick positions alone would leave the labels out of step
# with the bars.
tick_positions = np.arange(0, len(codes_in_order), 5)
f.set_xticks(tick_positions)
f.set_xticklabels(codes_in_order.index[tick_positions])
plt.title("Number of Parking Tickets Given for Each Violation Code", fontsize=16)
plt.xlabel("Violation Code", fontsize=16)
plt.ylabel("No. of cars", fontsize=16)
plt.show()

Most of the parking tickets are given for violation codes 7, 9, and 5. Violation code 7 stands for **failure to stop at red light**, violation code 9 for **obstructing traffic/intersect**, and violation code 5 for **bus lane violation**.

## How many parking tickets are given for each body type?

In [None]:
# Count tickets per body type once; the bars are drawn in descending order of
# count, which is the order value_counts() returns.
body_type_counts = mini2017['Vehicle Body Type'].value_counts()
x_ticks = body_type_counts.index
heights = body_type_counts.values
y_pos = np.arange(len(x_ticks))
fig = plt.figure(figsize=(20,4))
f = sns.barplot(y=heights, x=y_pos, orient = 'v', alpha=0.6);
# Hide the x tick labels. This must be a real boolean: the string 'off' is
# truthy, so current matplotlib would keep the labels visible.
plt.tick_params(labelbottom=False)
plt.ylabel('No. of cars', fontsize=16);
plt.xlabel('Car models [Label turned off due to crowding. Too many types.]', fontsize=16);
plt.title('Parking ticket given for different type of car body', fontsize=16);


In [None]:
# Share of the sampled tickets per vehicle body type, ten largest shares.
# The top 10 is taken on the Series and the column is named explicitly, because
# the column name produced by value_counts() differs between pandas versions
# (the source column name before 2.0, 'count' from 2.0 on), which made
# DataFrame.nlargest(columns=['Vehicle Body Type']) raise KeyError.
df_bodytype = (
    (mini2017['Vehicle Body Type'].value_counts() / len(mini2017))
    .nlargest(10)
    .to_frame(name='Vehicle Body Type')
)

The top 10 vehicle body types that receive the most parking tickets are listed below:

In [None]:
# Display the top-10 body-type shares.
df_bodytype

In [None]:
# From the chart above, the top 3 vehicle body types that have been given the most tickets are Suburban, Four-door sedan, and Van.

## How many parking tickets are given for each vehicle make?

Just to vary the visualization, this time I will plot the logarithm of the number of cars for each make. On a log scale, much smaller counts remain visible in the same graph as the larger ones.

In [None]:
# Tickets per vehicle make, in descending order of count.
vehicle_make = mini2017['Vehicle Make'].value_counts()

plt.figure(figsize=(16,8))
# Natural log of the counts so rare makes stay visible next to common ones
# (a make with a single ticket therefore has height 0).
f = sns.barplot(y=np.log(vehicle_make.values), x=vehicle_make.index, alpha=0.6)
# Hide the x tick labels. This must be a real boolean: the string 'off' is
# truthy, so current matplotlib would keep the labels visible.
plt.tick_params(labelbottom=False)
plt.ylabel('log(No. of cars)', fontsize=16);
plt.xlabel('Car make [Label turned off due to crowding. Too many companies!]', fontsize=16);
plt.title('Parking ticket given for different type of car make', fontsize=16);

plt.show()

In [None]:
# Share of the sampled tickets per vehicle make, ten largest shares.
# The top 10 is taken on the Series and the column is named explicitly, because
# the column name produced by value_counts() differs between pandas versions
# (the source column name before 2.0, 'count' from 2.0 on), which made
# DataFrame.nlargest(columns=['Vehicle Make']) raise KeyError.
(
    (mini2017['Vehicle Make'].value_counts() / len(mini2017))
    .nlargest(10)
    .to_frame(name='Vehicle Make')
)

## How does the number of tickets vary throughout the day?

In [None]:
# Convert the raw 'Violation Time' values into timestamps.
# As in the loop this replaces, only 5-character values are treated as valid
# (presumably 'HHMM' followed by 'A' or 'P', e.g. '0143A' -- TODO confirm against
# the data dictionary); everything else becomes NaT.
raw_time = mini2017['Violation Time']

# Guard: once the column holds datetimes, re-running this cell must not parse it
# again. A stringified timestamp is not 5 characters long, so without the guard
# a second run would replace every value with NaT.
if not pd.api.types.is_datetime64_any_dtype(raw_time):
    text = raw_time.astype(str)
    text = text.where(text.str.len() == 5)
    # '%I' accepts hours 01-12 only, so an hour written as '00' is mapped to
    # '12' ('00xxA' -> 12:xx AM, i.e. just after midnight).
    hour_part = text.str[:2].replace('00', '12')
    # Append 'M' so the trailing 'A'/'P' reads as AM/PM for '%p'
    # (NOTE: '%p' matching depends on the process locale).
    # The date part of the result is the strptime default (1900-01-01); only the
    # time of day is meaningful.
    timestamp = pd.to_datetime(hour_part + text.str[2:] + 'M',
                               format='%I%M%p', errors='coerce')
    # assign() returns a new frame and replaces the column under its own name,
    # so no temporary column or rename is needed and the row index is untouched.
    mini2017 = mini2017.assign(**{'Violation Time': timestamp})

In [None]:
# Hour of day (0-23) of each parsed violation time; NaN where the time is NaT.
# The previous expression only built a two-element list holding a lambda and the
# column, without ever applying one to the other.
hours = mini2017['Violation Time'].dt.hour

In [None]:
# Getting the histogram
mini2017.set_index('Violation Time', drop=False, inplace=True)
plt.figure(figsize=(16,8))
mini2017['Violation Time'].groupby(pd.TimeGrouper(freq='60Min')).count().plot(kind='bar');
plt.tick_params(labelbottom='on')
plt.ylabel('No. of cars', fontsize=16);
plt.xlabel('Day Time', fontsize=16);
plt.title('Parking ticket given at different time of the day', fontsize=16);


It seems that most parking tickets are given between **8:00 am** and **2:00 pm**.

## Parking ticket vs county

In [None]:
# Tickets per violation county, in descending order of count.
violation_county = mini2017['Violation County'].value_counts()

plt.figure(figsize=(12,8))
f = sns.barplot(y=violation_county.values, x=violation_county.index, alpha=0.6)
# Keep the x tick labels visible: there are few enough counties to read them.
# A real boolean is used instead of the string 'on'.
plt.tick_params(labelbottom=True)
plt.ylabel('No. of cars', fontsize=16);
plt.xlabel('County', fontsize=16);
plt.title('Parking ticket given in different counties', fontsize=16);

In [None]:
# Empty cell: the bare name `nan` that stood here is not defined anywhere in the
# notebook and raised NameError on "Run All".