## The below script loads data from Calls_for_service file and merges with data from max_cfs_ucr_categories file.
Following constraints are applied:
- Records year range = 2012 - 2016
- Records Disposition = RTF 
- Records CrimeType = Violent Crime

In [1]:
import os
import csv
import zipfile
import string
import pandas as pd
import datetime as dt
import numpy as np

## Data Loading

### Load Calls-for-Service Data

In [2]:
# Set location of file
path = os.path.join("../Datasets/Raw_Data/Calls_for_Service/")
path

'../Datasets/Raw_Data/Calls_for_Service/'

In [3]:
# Get filenames
filenames = os.listdir(path)
filenames

['Calls_for_Service_2013.zip',
 'Calls_for_Service_2012.zip',
 'Calls_for_Service_2016.zip',
 'Calls_for_Service_2015.zip',
 'Calls_for_Service_2014.zip']

In [4]:
# Load data from files in list
dfs = []
for f in filenames:
    zf = zipfile.ZipFile(os.path.join(path,f)) 
    dfs.append(pd.read_csv(zf.open(str.replace(f, 'zip', 'csv')),))

# Merge all df in list
cfs_df = pd.concat(dfs, ignore_index=True)

# Change datatype of column Type_ to String
cfs_df.Type_ = cfs_df.Type_.apply(str)

In [5]:
# Display top 5 rows
cfs_df.head()

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
0,A0000113,94,DISCHARGING FIREARM,2B,,,,3696313,533332,12/31/2012 23:59,...,1/1/2013 0:23,1/1/2013 0:24,UNF,UNFOUNDED,,,052XX Burgundy St,70117.0,5,"(29.960019973022543, -90.02123092953371)"
1,A0000213,94,DISCHARGING FIREARM,2B,,,,3710263,518976,12/31/2012 23:59,...,1/1/2013 0:14,1/1/2013 0:21,UNF,UNFOUNDED,,,029XX Bacchus Dr,70131.0,4,"(29.92009950776069, -89.97771660629039)"
2,A0000313,67S,SHOPLIFTING,1C,,,,3683068,531830,1/1/2013 0:00,...,,1/1/2013 0:33,DUP,DUPLICATE,,,006XX Decatur St,70130.0,8,"(29.956300375853115, -90.06310829672566)"
3,A0000413,21,COMPLAINT OTHER,1H,,,,3673396,533473,1/1/2013 0:00,...,1/1/2013 0:00,1/1/2013 0:13,NAT,NECESSARY ACTION TAKEN,,,007XX S White St,70119.0,1,"(29.961109350200267, -90.09359315760538)"
4,A0000513,62A,"BURGLAR ALARM, SILEN",2C,,,,3665197,544507,1/1/2013 0:00,...,,1/1/2013 0:36,VOI,VOID,,,055XX Cherlyn Dr,70124.0,3,"(29.991690537674508, -90.11911498388514)"


In [6]:
# Number of records
len(cfs_df)

2252907

In [7]:
# Column Names
cfs_df.columns

Index([u'NOPD_Item', u'Type_', u'TypeText', u'Priority', u'InitialType',
       u'InitialTypeText', u'InitialPriority', u'MapX', u'MapY', u'TimeCreate',
       u'TimeDispatch', u'TimeArrive', u'TimeClosed', u'Disposition',
       u'DispositionText', u'SelfInitiated', u'Beat', u'BLOCK_ADDRESS', u'Zip',
       u'PoliceDistrict', u'Location'],
      dtype='object')

### Filter Records with Disposition == 'RTF' 

In [8]:
cfs_df = cfs_df[cfs_df.Disposition.str.strip() == 'RTF']

In [9]:
# Number of records
len(cfs_df)

428570

In [10]:
# Display top 5 rows
cfs_df.head()

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,TimeArrive,TimeClosed,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location
23,A0002413,21,COMPLAINT OTHER,1H,,,,3686582,535344,1/1/2013 0:05,...,1/1/2013 0:30,1/1/2013 2:14,RTF,REPORT TO FOLLOW,,,Dauphine St & Saint Roch Ave,70117.0,8,"(29.96585508211896, -90.05188819386355)"
34,A0003513,966,DRUG VIOLATIONS,1G,,,,3681184,531226,1/1/2013 0:07,...,,1/1/2013 0:08,RTF,REPORT TO FOLLOW,,,002XX Bourbon St,70112.0,8,"(29.954696873099614, -90.06907837799429)"
39,A0004013,21,COMPLAINT OTHER,1H,,,,3682054,532371,1/1/2013 0:08,...,,1/1/2013 0:11,RTF,REPORT TO FOLLOW,,,006XX Bourbon St,70112.0,8,"(29.95781881107571, -90.06629127670833)"
45,A0004713,966,DRUG VIOLATIONS,1G,,,,3681184,531226,1/1/2013 0:09,...,1/1/2013 0:29,1/1/2013 0:55,RTF,REPORT TO FOLLOW,,,002XX Bourbon St,70112.0,8,"(29.954696873099614, -90.06907837799429)"
57,A0006013,21,COMPLAINT OTHER,1H,,,,3681624,531805,1/1/2013 0:12,...,,1/1/2013 0:13,RTF,REPORT TO FOLLOW,,,004XX Bourbon St,70112.0,8,"(29.95627557056932, -90.06766883413155)"


### Load Categories Data

In [11]:
# Set location of file
fname = "../Datasets/Raw_Data/MAX_CFS_UCR_Categories.xlsx"

# Load file
crime_types = pd.read_excel(fname,sheetname='Sheet1')

# Select required columns
crime_types = crime_types.ix[:,['Code','UCR MAIN','Description']]

# Rename columns
crime_types.rename(columns={'Code':'Type_','UCR MAIN':'CrimeType','Description':'Description'},inplace=True)

# Change datatype of column Type_ to String
crime_types.Type_ = crime_types.Type_.apply(str)
crime_types.Type_ = crime_types.Type_.str.strip()

### Filter Records with CrimeType == 'VIOLENT CRIME'

In [12]:
crime_types = crime_types[crime_types.CrimeType.str.strip() == 'VIOLENT CRIME']

In [13]:
crime_types.head()

Unnamed: 0,Type_,CrimeType,Description
71,30,VIOLENT CRIME,HOMICIDE
72,30C,VIOLENT CRIME,HOMICIDE BY CUTTING
73,30D,VIOLENT CRIME,HOMICIDE DOMESTIC
74,30S,VIOLENT CRIME,HOMICIDE BY SHOOTING
75,34,VIOLENT CRIME,AGGRAVATED BATTERY


### Merge Calls-for-Service with Categories data

In [14]:
merged_df = pd.merge(cfs_df,crime_types,on='Type_',how='inner')

In [15]:
merged_df.head()

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location,CrimeType,Description
0,A0015113,34,AGGRAVATED BATTERY,2B,,,,3683068,531830,1/1/2013 0:56,...,RTF,REPORT TO FOLLOW,,,006XX Decatur St,70130.0,8,"(29.956300375853115, -90.06310829672566)",VIOLENT CRIME,AGGRAVATED BATTERY
1,A0247713,34,AGGRAVATED BATTERY,2B,,,,3683521,540346,1/2/2013 17:21,...,RTF,REPORT TO FOLLOW,,,029XX Pauger St,70119.0,5,"(29.979702616404193, -90.06137937316943)",VIOLENT CRIME,AGGRAVATED BATTERY
2,A3100313,34,AGGRAVATED BATTERY,2B,,,,3682102,529887,1/24/2013 1:25,...,RTF,REPORT TO FOLLOW,,,001XX Tchoupitoulas St,70130.0,8,"(29.950987194039513, -90.06622644536927)",VIOLENT CRIME,AGGRAVATED BATTERY
3,A3302913,34,AGGRAVATED BATTERY,2B,,,,3694972,559141,1/25/2013 16:17,...,RTF,REPORT TO FOLLOW,,,078XX W Laverne St,70126.0,7,"(30.031027412521865, -90.02453459898635)",VIOLENT CRIME,AGGRAVATED BATTERY
4,A3475413,34,AGGRAVATED BATTERY,2B,,,,3681217,530712,1/26/2013 17:32,...,RTF,REPORT TO FOLLOW,,,Canal St & Royal St,70130.0,8,"(29.953282545767404, -90.0689920854026)",VIOLENT CRIME,AGGRAVATED BATTERY


In [16]:
len(merged_df)

13954

### Filter Records with CrimeType = NaN or Null

In [17]:
# Check length of records with CrimeType Null
len(merged_df[merged_df.CrimeType.isnull()])

0

In [18]:
# Check length of records with CrimeType Not Null
len(merged_df[merged_df.CrimeType.notnull()])

13954

In [19]:
# Remove records with CrimeType Null
cfs_final = merged_df[merged_df.CrimeType.notnull()]

In [20]:
cfs_final.head()

Unnamed: 0,NOPD_Item,Type_,TypeText,Priority,InitialType,InitialTypeText,InitialPriority,MapX,MapY,TimeCreate,...,Disposition,DispositionText,SelfInitiated,Beat,BLOCK_ADDRESS,Zip,PoliceDistrict,Location,CrimeType,Description
0,A0015113,34,AGGRAVATED BATTERY,2B,,,,3683068,531830,1/1/2013 0:56,...,RTF,REPORT TO FOLLOW,,,006XX Decatur St,70130.0,8,"(29.956300375853115, -90.06310829672566)",VIOLENT CRIME,AGGRAVATED BATTERY
1,A0247713,34,AGGRAVATED BATTERY,2B,,,,3683521,540346,1/2/2013 17:21,...,RTF,REPORT TO FOLLOW,,,029XX Pauger St,70119.0,5,"(29.979702616404193, -90.06137937316943)",VIOLENT CRIME,AGGRAVATED BATTERY
2,A3100313,34,AGGRAVATED BATTERY,2B,,,,3682102,529887,1/24/2013 1:25,...,RTF,REPORT TO FOLLOW,,,001XX Tchoupitoulas St,70130.0,8,"(29.950987194039513, -90.06622644536927)",VIOLENT CRIME,AGGRAVATED BATTERY
3,A3302913,34,AGGRAVATED BATTERY,2B,,,,3694972,559141,1/25/2013 16:17,...,RTF,REPORT TO FOLLOW,,,078XX W Laverne St,70126.0,7,"(30.031027412521865, -90.02453459898635)",VIOLENT CRIME,AGGRAVATED BATTERY
4,A3475413,34,AGGRAVATED BATTERY,2B,,,,3681217,530712,1/26/2013 17:32,...,RTF,REPORT TO FOLLOW,,,Canal St & Royal St,70130.0,8,"(29.953282545767404, -90.0689920854026)",VIOLENT CRIME,AGGRAVATED BATTERY


In [21]:
cfs_final.Type_.unique()

array(['34', '34S', '65', '34C', '42', '30S', '55', '30C', '64G', '64J',
       '37', '30', '37D', '64', '43', '64K', '42M', '65J', '34D', '30D',
       '42B', '43M'], dtype=object)

In [22]:
len(cfs_final)

13954

In [23]:
cfs_final.TimeArrive = pd.to_datetime(cfs_final.TimeArrive)

In [24]:
cfs_final.TimeClosed = pd.to_datetime(cfs_final.TimeClosed)
cfs_final.TimeCreate = pd.to_datetime(cfs_final.TimeCreate)
cfs_final.TimeDispatch = pd.to_datetime(cfs_final.TimeDispatch)

In [25]:
# Save the data
fullpath = "../Datasets/Final_Data/Calls_for_Service.csv"
cfs_final.to_csv(fullpath, sep=',',  index = False)

In [26]:
cfs_final.columns

Index([u'NOPD_Item', u'Type_', u'TypeText', u'Priority', u'InitialType',
       u'InitialTypeText', u'InitialPriority', u'MapX', u'MapY', u'TimeCreate',
       u'TimeDispatch', u'TimeArrive', u'TimeClosed', u'Disposition',
       u'DispositionText', u'SelfInitiated', u'Beat', u'BLOCK_ADDRESS', u'Zip',
       u'PoliceDistrict', u'Location', u'CrimeType', u'Description'],
      dtype='object')

In [27]:
# count the number of crimes within each day
crime_date = pd.to_datetime(cfs_final.TimeCreate)
year = crime_date.dt.year
month = crime_date.dt.month
day = crime_date.dt.day
crime_date = pd.to_datetime(year*10000 + month*100 + day, format='%Y%m%d')

crime_count = pd.DataFrame({'Zip': cfs_final.Zip,
                            'Date': crime_date,
                           }, index = None)


# # crime_sum = pd.DataFrame(crime_sum)

In [28]:
crime_sum = pd.DataFrame({'count' : crime_count.groupby(['Zip','Date']).size()}).reset_index()


In [29]:
crime_sum.Zip = crime_sum.Zip.astype(int)
crime_sum.head()

Unnamed: 0,Zip,Date,count
0,70112,2012-01-01,2
1,70112,2012-01-02,2
2,70112,2012-01-05,1
3,70112,2012-01-06,1
4,70112,2012-01-08,1


# Merging with Stop and Search

In [30]:
s_n_s_file_path = "../Datasets/Raw_Data/Stop_and_Search/Stop_and_Search__Field_Interviews_.csv"

In [31]:
sns_df = pd.read_csv(s_n_s_file_path) 

  interactivity=interactivity, compiler=compiler, result=result)


In [32]:
sns_df.head()

Unnamed: 0,FieldInterviewID,NOPD_Item,EventDate,District,Zone,OfficerAssignment,StopDescription,ActionsTaken,VehicleYear,VehicleMake,...,SubjectWeight,SubjectEyeColor,SubjectHairColor,SubjectDriverLicState,CreatedDateTime,LastModifiedDateTime,Longitude,Latitude,Zip,BlockAddress
0,17415,,01/01/2010 01:11:00 AM,6,E,6th District,TRAFFIC VIOLATION,,2005.0,DODGE,...,160.0,Brown,Black,LA,01/01/2010 01:26:26 AM,,0.0,0.0,,
1,17416,,01/01/2010 02:06:00 AM,5,D,5th District,CALL FOR SERVICE,,,,...,140.0,Brown,Black,,01/01/2010 02:27:38 AM,,0.0,0.0,,
2,17416,,01/01/2010 02:06:00 AM,5,D,5th District,CALL FOR SERVICE,,,,...,145.0,Brown,Black,,01/01/2010 02:27:38 AM,,0.0,0.0,,
3,17416,,01/01/2010 02:06:00 AM,5,D,5th District,CALL FOR SERVICE,,,,...,140.0,Brown,Black,,01/01/2010 02:27:38 AM,,0.0,0.0,,
4,17416,,01/01/2010 02:06:00 AM,5,D,5th District,CALL FOR SERVICE,,,,...,140.0,Brown,Black,,01/01/2010 02:27:38 AM,,0.0,0.0,,


In [33]:
# number of stops and searches made
new_sns_df =  sns_df[~pd.isnull(sns_df.Zip)]
new_sns_df= new_sns_df.reset_index(drop=True)



In [34]:
sns_date = pd.to_datetime(new_sns_df.EventDate)


In [35]:
year = sns_date.dt.year
month = sns_date.dt.month
day = sns_date.dt.day
sns_date = pd.to_datetime(year*10000 + month*100 + day, format='%Y%m%d')


In [36]:
sns_count = pd.DataFrame({'Zip': new_sns_df.Zip,
                            'Date': sns_date,
                           }, index = None)

In [37]:
sns_count.head()

Unnamed: 0,Date,Zip
0,2010-10-31,70115.0
1,2011-02-09,70122.0
2,2011-03-29,70127.0
3,2011-03-29,70127.0
4,2011-03-29,70126.0


In [38]:
sns_count.Zip = sns_count.Zip.astype(int)


In [39]:
stops_sum = pd.DataFrame({'count' : sns_count.groupby(['Zip','Date']).size()}).reset_index()

In [40]:
stops_sum.head()

Unnamed: 0,Zip,Date,count
0,70112,2001-03-03,1
1,70112,2010-01-03,1
2,70112,2010-07-31,1
3,70112,2011-05-05,2
4,70112,2011-05-06,15


In [41]:
def countCrimeStopInTimeWindow(crimeData, stopData, timeFrame, timeWindow):
    # timeFrame - timeWindow
    # month - month / 2 weeks / week
    # 2 weeks - 2 weeks / week / day
    # week - week / day
    stopDate = []
    crimeSum = []
    stopSum = []
    
    if timeFrame == "month":
        if timeWindow == "month":
            window = 30
        elif timeWindow == "2 weeks":
            window = 15
        elif timeWindow == "week":
            window = 7
        for year in range(2012, 2017):
            for month in range(1, 13):
                stopStart = stopData.index.searchsorted(dt.datetime(year, month, 1))
                crimeStart = stopStart
                if month in [1, 3, 5, 7, 8, 10, 12]:
                    stopEnd = stopData.index.searchsorted(dt.datetime(year, month, 31))
                elif month in [4, 6, 9, 11]:
                    stopEnd = stopData.index.searchsorted(dt.datetime(year, month, 30))
                else:
                    stopEnd = stopData.index.searchsorted(dt.datetime(year, month, 28))
                crimeEnd = stopEnd + window
                stopDate.append(str(year) + "-" + str(month))
                stopSum.append(int(sum(stopData.ix[stopStart:stopEnd].values)))
                crimeSum.append(int(sum(crimeData.ix[crimeStart:crimeEnd].values)))
                
    
    if timeFrame == "2 weeks":
        if timeWindow == "2 weeks":
            window = 15
        elif timeWindow == "week":
            window = 7
        elif timeWindow == "day":
            window = 1
        for year in range(2012, 2017):
            for month in range(1, 13):
                for half in [1, 2]:
                    stopStart = stopData.index.searchsorted(dt.datetime(year, month, 1 + 15 * (half - 1)))
                    crimeStart = stopStart
                    if half == 1:
                        stopEnd = stopData.index.searchsorted(dt.datetime(year, month, 15))
                    else:
                        if month in [1, 3, 5, 7, 8, 10, 12]:
                            stopEnd = stopData.index.searchsorted(dt.datetime(year, month, 31))
                        elif month in [4, 6, 9, 11]:
                            stopEnd = stopData.index.searchsorted(dt.datetime(year, month, 30))
                        else:
                            stopEnd = stopData.index.searchsorted(dt.datetime(year, month, 28))
                    crimeEnd = stopEnd + window
                    stopDate.append(str(year) + "-" + str(month) + "-" + str(half))
                    stopSum.append(int(sum(stopData.ix[stopStart:stopEnd].values)))
                    crimeSum.append(int(sum(crimeData.ix[crimeStart:crimeEnd].values)))
                    
                    
    if timeFrame == "week":
        numWeek = int(len(crimeData) / 7)
        if timeWindow == "week":
            window = 7
        if timeWindow == "day":
            window = 1
        for year in range(2012, 2017):
            for week in range(numWeek):
                stopStart = stopData.index.searchsorted(dt.datetime(2012, 1, 2)) + 7 * (week - 1)
                crimeStart = stopStart
                stopEnd = stopStart + 7
                crimeEnd = stopEnd + window
                stopDate.append(str(year) + "-" + str(week))
                stopSum.append(int(sum(stopData.ix[stopStart:stopEnd].values)))
                crimeSum.append(int(sum(crimeData.ix[crimeStart:crimeEnd].values)))
                               
    return [stopDate, crimeSum, stopSum]

In [48]:
# fill seaborn scatter here

In [44]:
def get_all_correlations(crimes_count,stops_count,zip_code):
    
    month_month = countCrimeStopInTimeWindow(crimes_count, stops_count, "month", "month")
    month_twoWeeks = countCrimeStopInTimeWindow(crimes_count, stops_count, "month", "2 weeks")
    month_week = countCrimeStopInTimeWindow(crimes_count, stops_count, "month", "week")
    twoWeeks_twoWeeks = countCrimeStopInTimeWindow(crimes_count, stops_count, "2 weeks", "2 weeks")
    twoWeeks_week = countCrimeStopInTimeWindow(crimes_count, stops_count, "2 weeks", "week")
    twoWeeks_day = countCrimeStopInTimeWindow(crimes_count, stops_count, "2 weeks", "week")
    week_week = countCrimeStopInTimeWindow(crimes_count, stops_count, "week", "week")
    week_day = countCrimeStopInTimeWindow(crimes_count, stops_count, "week", "day")
    
    
    
    
    fig['layout'].update(height=600, width=600,
                     title='Calls For Service')
    
    py.iplot(fig, filename='Calls For Service')
#     data = [trace1,trace2, trace3,trace4,trace5,trace6,trace7,trace8]
    
    
    
    mm_cor = np.corrcoef(month_month[1], month_month[2])[0, 1]
    mtw_cor = np.corrcoef(month_twoWeeks[1], month_twoWeeks[2])[0, 1]
    mw_cor = np.corrcoef(month_week[1], month_week[2])[0, 1]
    twtw_cor = np.corrcoef(twoWeeks_twoWeeks[1], twoWeeks_twoWeeks[2])[0, 1]
    tww_cor = np.corrcoef(twoWeeks_week[1], twoWeeks_week[2])[0, 1]
    twd_cor = np.corrcoef(twoWeeks_day[1], twoWeeks_day[2])[0, 1]
    ww_cor = np.corrcoef(week_week[1], week_week[2])[0, 1]
    wwd_cor = np.corrcoef(week_day[1], week_day[2])[0, 1]
    
    return [(zip_code,[mm_cor,mtw_cor,mw_cor,mw_cor,twtw_cor,tww_cor,twd_cor,ww_cor,wwd_cor])]

In [None]:
# print(mm_cor, mtw_cor, mw_cor, twtw_cor, tww_cor, twd_cor, ww_cor, wwd_cor)

In [47]:
def correlation_for_each_zip(crime_count,stop_count):
    unique_zips = crime_count['Zip'].unique()
    for i in unique_zips:
        zip_1_crimes = crime_sum[crime_sum.Zip==i]
        zip_1_crimes = crime_sum[['Date','count']]
        zip_1_crimes = zip_1_crimes.set_index(['Date'])
        zip_1_stops = stops_sum[stops_sum.Zip==i]
        zip_1_stops = zip_1_stops[['Date','count']]
        zip_1_stops = zip_1_stops.set_index(['Date'])
        print get_all_correlations(crimes_count=zip_1_crimes,stops_count=zip_1_stops,zip_code=i)
        
        
        

In [46]:
correlation_for_each_zip(crime_count=crime_sum,stop_count=stops_sum)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]  [ (1,4) x4,y4 ]
[ (2,1) x5,y5 ]  [ (2,2) x6,y6 ]  [ (2,3) x7,y7 ]  [ (2,4) x8,y8 ]

[(70112, [0.2130708405143367, 0.18048123582078063, 0.17722943813208059, 0.17722943813208059, 0.14893049397787328, 0.14511053611707697, 0.14511053611707697, -0.10754265084858974, -0.098334949078266565])]
This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]  [ (1,4) x4,y4 ]
[ (2,1) x5,y5 ]  [ (2,2) x6,y6 ]  [ (2,3) x7,y7 ]  [ (2,4) x8,y8 ]

[(70113, [0.28710821020780602, 0.31867965658942227, 0.38301585974124303, 0.38301585974124303, 0.16319834158331248, 0.18338384680889483, 0.18338384680889483, -0.11131960414332438, -0.097504468375460521])]
This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]  [ (1,4) x4,y4 ]
[ (2,1) x5,y5 ]  [ (2,2) x6,y6 ]  [ (2,3) x7,y7 ]  [ (2,4) x8,y8 ]

[(70114, [0.3411499759094519, 0.28405987900667568, 0.27370961071920424, 0.

In [None]:
# stripping NOPD ITem to match exactly
sns_df.NOPD_Item = sns_df.NOPD_Item.str.strip()

In [None]:
cfs_df = pd.read_csv(fullpath) 

In [None]:
cfs_df.NOPD_Item = cfs_df.NOPD_Item.str.strip()

In [None]:
cfs_sns_df = pd.merge(cfs_df, sns_df, how='inner', on='NOPD_Item')

In [None]:
len(cfs_sns_df),len(cfs_sns_df.NOPD_Item.unique())

So we have 2062 field interviews followed by CFS - RTF report for violent crimes. Out of which some of the field searches are done repeatedly for the same NOPD_ITEM

In [None]:
cfs_sns_path = "..\\Datasets\\Final_Data\\CFS_SNS.csv"
cfs_sns_df.to_csv(cfs_sns_path, sep=',',  index = False)

In [None]:
# starting new again
path = "..\\Datasets\\Final_Data\\Calls_for_Service.csv"

In [None]:
cfs_df = pd.read_csv(path)