In [1]:
# Import Dependencies
import pandas as pd
import datetime as dt

In [2]:
# Read CTA Data
cta_data = pd.read_csv("CTA_-_Ridership_-_Daily_Boarding_Totals.csv")

# Rename CTA Columns (positional: the file must have exactly these five columns)
cta_data.columns = ["Date", "Day Type", "Bus", "Rail Boardings", "Total Rides"]

# Drop Duplicate Dates, Keeping the First Row Seen for Each Date
cta_data = cta_data.drop_duplicates(subset="Date", keep="first")

# Assign Date Column (Still Strings) to a Variable
all_dates_cta = cta_data["Date"]

# Replace Date Column with Datetime Values
# The explicit format makes parsing strict: any value that is not MM/DD/YYYY
# raises instead of being guessed at. assign() builds a new frame, so nothing
# is written into a slice of the de-duplicated data.
cta_data = cta_data.assign(Date=pd.to_datetime(all_dates_cta, format="%m/%d/%Y"))

# Filter Out Data From After 2019
cta_data = cta_data[cta_data["Date"].dt.year <= 2019]

# Display CTA Data
cta_data

Unnamed: 0,Date,Day Type,Bus,Rail Boardings,Total Rides
0,2001-01-01,U,297192,126455,423647
1,2001-01-02,W,780827,501952,1282779
2,2001-01-03,W,824923,536432,1361355
3,2001-01-04,W,870021,550011,1420032
4,2001-01-05,W,890426,557917,1448343
...,...,...,...,...,...
6996,2019-12-27,W,552198,445835,998033
6997,2019-12-28,A,394869,298646,693515
6998,2019-12-29,U,315428,251105,566533
6999,2019-12-30,W,544713,480721,1025434


In [3]:
# Count Non-Null Values in Each Column
# (a column whose count is lower than the others has missing entries)
cta_data.count(axis="index")

Date              6939
Day Type          6939
Bus               6939
Rail Boardings    6939
Total Rides       6939
dtype: int64

In [4]:
# Load the Crime CSV into a DataFrame
crime_csv_name = "Crimes_-_2001_to_present.csv"
crime_data = pd.read_csv(crime_csv_name)

# Show the Loaded Frame as the Cell Output
crime_data

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,25021,JD204167,04/07/2020 05:09:00 PM,002XX S STATE ST,0110,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,...,42.0,32.0,01A,1176417.0,1899156.0,2020,04/14/2020 03:49:15 PM,41.878639,-87.627691,"(41.878638996, -87.627691486)"
1,23431,JA340827,07/09/2017 02:36:00 PM,000XX W 95TH ST,0110,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,...,21.0,49.0,01A,1177762.0,1841949.0,2017,11/12/2019 03:55:53 PM,41.721627,-87.624485,"(41.721627204, -87.624485177)"
2,24644,JC347217,07/13/2019 03:10:00 PM,001XX W CERMAK RD,0110,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,...,25.0,34.0,01A,1175616.0,1889758.0,2019,07/20/2019 04:13:10 PM,41.852868,-87.630915,"(41.852868298, -87.63091491)"
3,19823,HT223608,03/29/2011 08:11:00 AM,009XX W FULLERTON AVE,0110,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,...,43.0,7.0,01A,1169577.0,1916141.0,2011,03/19/2019 04:11:22 PM,41.925398,-87.652311,"(41.925398449, -87.652311296)"
4,20824,HW194557,03/16/2013 12:13:00 PM,003XX E 43RD ST,0110,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,...,3.0,38.0,01A,1178941.0,1876568.0,2013,03/19/2019 04:11:22 PM,41.816599,-87.619114,"(41.816598679, -87.619113763)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110575,9997762,HY187975,03/11/2015 09:00:00 AM,004XX S WESTERN AVE,0820,THEFT,$500 AND UNDER,CTA TRAIN,False,True,...,2.0,28.0,06,1160478.0,1897731.0,2015,02/10/2018 03:50:01 PM,41.875073,-87.686256,"(41.875073359, -87.68625556)"
110576,9997920,HY187954,03/16/2015 04:15:00 PM,001XX W LAKE ST,0870,THEFT,POCKET-PICKING,CTA TRAIN,True,False,...,42.0,32.0,06,1175301.0,1901735.0,2015,02/10/2018 03:50:01 PM,41.885741,-87.631712,"(41.885741029, -87.631711749)"
110577,9997961,HY188039,03/16/2015 04:15:00 PM,001XX W LAKE ST,1150,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,CTA TRAIN,False,False,...,42.0,32.0,11,1175301.0,1901735.0,2015,02/10/2018 03:50:01 PM,41.885741,-87.631712,"(41.885741029, -87.631711749)"
110578,9999035,HY189070,03/17/2015 05:00:00 PM,100XX W OHARE ST,0880,THEFT,PURSE-SNATCHING,CTA TRAIN,False,False,...,41.0,76.0,06,1100658.0,1934241.0,2015,02/10/2018 03:50:01 PM,41.976290,-87.905227,"(41.976290414, -87.905227221)"


In [5]:
# Remove the Columns That Are Not Carried Forward
# Each column is deleted from crime_data in place, in the order listed.
unwanted_columns = [
    "ID",
    "Case Number",
    "IUCR",
    "Beat",
    "District",
    "Ward",
    "Community Area",
    "FBI Code",
    "X Coordinate",
    "Y Coordinate",
    "Updated On",
]
for column in unwanted_columns:
    del crime_data[column]

In [6]:
# Drop Time from Date for All Rows
# Keep only the first 10 characters of each value (the MM/DD/YYYY part of
# strings such as "04/07/2020 05:09:00 PM"). The slice is vectorized: one pass
# over the column instead of a Python loop that rescans the whole column for
# every row, and it does not depend on the index labels being 0..n-1.
crime_data["Date"] = crime_data["Date"].str[:10]

In [7]:
# Count Non-Null Values in Each Column
# (a column whose count is lower than the others has missing entries)
crime_data.count(axis="index")

Date                    110580
Block                   110580
Primary Type            110580
Description             110580
Location Description    110580
Arrest                  110580
Domestic                110580
Year                    110580
Latitude                109808
Longitude               109808
Location                109808
dtype: int64

In [8]:
# Drop All Rows With Missing Information
# .copy() makes the result an independent frame, so later column assignments
# on crime_data are not treated as writes to a slice of the original
# (the source of SettingWithCopyWarning).
crime_data = crime_data.dropna(how="any").copy()

# Verify Dropped Rows
# A bare expression in the middle of a cell is never displayed, so the check
# is an assertion: it fails loudly if any missing value survived.
assert crime_data.notna().all().all(), "crime_data still contains missing values"

# Display Crime Data
crime_data

Unnamed: 0,Date,Block,Primary Type,Description,Location Description,Arrest,Domestic,Year,Latitude,Longitude,Location
0,04/07/2020,002XX S STATE ST,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2020,41.878639,-87.627691,"(41.878638996, -87.627691486)"
1,07/09/2017,000XX W 95TH ST,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2017,41.721627,-87.624485,"(41.721627204, -87.624485177)"
2,07/13/2019,001XX W CERMAK RD,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2019,41.852868,-87.630915,"(41.852868298, -87.63091491)"
3,03/29/2011,009XX W FULLERTON AVE,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2011,41.925398,-87.652311,"(41.925398449, -87.652311296)"
4,03/16/2013,003XX E 43RD ST,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2013,41.816599,-87.619114,"(41.816598679, -87.619113763)"
...,...,...,...,...,...,...,...,...,...,...,...
110575,03/11/2015,004XX S WESTERN AVE,THEFT,$500 AND UNDER,CTA TRAIN,False,True,2015,41.875073,-87.686256,"(41.875073359, -87.68625556)"
110576,03/16/2015,001XX W LAKE ST,THEFT,POCKET-PICKING,CTA TRAIN,True,False,2015,41.885741,-87.631712,"(41.885741029, -87.631711749)"
110577,03/16/2015,001XX W LAKE ST,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,CTA TRAIN,False,False,2015,41.885741,-87.631712,"(41.885741029, -87.631711749)"
110578,03/17/2015,100XX W OHARE ST,THEFT,PURSE-SNATCHING,CTA TRAIN,False,False,2015,41.976290,-87.905227,"(41.976290414, -87.905227221)"


In [9]:
# Assign Date Column (Still Strings) to a Variable
all_dates = crime_data["Date"]

# Replace Date Column with Datetime Values
# The explicit format makes parsing strict: any value that is not MM/DD/YYYY
# raises instead of being guessed at. assign() returns a new frame rather than
# writing into crime_data, which avoids SettingWithCopyWarning when crime_data
# was itself derived from another frame.
crime_data = crime_data.assign(Date=pd.to_datetime(all_dates, format="%m/%d/%Y"))

# Display Modified DataFrame
crime_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Date,Block,Primary Type,Description,Location Description,Arrest,Domestic,Year,Latitude,Longitude,Location
0,2020-04-07,002XX S STATE ST,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2020,41.878639,-87.627691,"(41.878638996, -87.627691486)"
1,2017-07-09,000XX W 95TH ST,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2017,41.721627,-87.624485,"(41.721627204, -87.624485177)"
2,2019-07-13,001XX W CERMAK RD,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2019,41.852868,-87.630915,"(41.852868298, -87.63091491)"
3,2011-03-29,009XX W FULLERTON AVE,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2011,41.925398,-87.652311,"(41.925398449, -87.652311296)"
4,2013-03-16,003XX E 43RD ST,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2013,41.816599,-87.619114,"(41.816598679, -87.619113763)"
...,...,...,...,...,...,...,...,...,...,...,...
110575,2015-03-11,004XX S WESTERN AVE,THEFT,$500 AND UNDER,CTA TRAIN,False,True,2015,41.875073,-87.686256,"(41.875073359, -87.68625556)"
110576,2015-03-16,001XX W LAKE ST,THEFT,POCKET-PICKING,CTA TRAIN,True,False,2015,41.885741,-87.631712,"(41.885741029, -87.631711749)"
110577,2015-03-16,001XX W LAKE ST,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,CTA TRAIN,False,False,2015,41.885741,-87.631712,"(41.885741029, -87.631711749)"
110578,2015-03-17,100XX W OHARE ST,THEFT,PURSE-SNATCHING,CTA TRAIN,False,False,2015,41.976290,-87.905227,"(41.976290414, -87.905227221)"


In [10]:
# Keep Only Rows Dated 2019 or Earlier
is_through_2019 = crime_data["Date"].dt.year <= 2019
crime_data = crime_data[is_through_2019]

# Show the Filtered Frame as the Cell Output
crime_data

Unnamed: 0,Date,Block,Primary Type,Description,Location Description,Arrest,Domestic,Year,Latitude,Longitude,Location
1,2017-07-09,000XX W 95TH ST,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2017,41.721627,-87.624485,"(41.721627204, -87.624485177)"
2,2019-07-13,001XX W CERMAK RD,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2019,41.852868,-87.630915,"(41.852868298, -87.63091491)"
3,2011-03-29,009XX W FULLERTON AVE,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2011,41.925398,-87.652311,"(41.925398449, -87.652311296)"
4,2013-03-16,003XX E 43RD ST,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2013,41.816599,-87.619114,"(41.816598679, -87.619113763)"
5,2008-07-27,011XX S STATE ST,HOMICIDE,FIRST DEGREE MURDER,"CTA ""L"" PLATFORM",True,False,2008,41.868165,-87.627440,"(41.868165405, -87.62743954)"
...,...,...,...,...,...,...,...,...,...,...,...
110575,2015-03-11,004XX S WESTERN AVE,THEFT,$500 AND UNDER,CTA TRAIN,False,True,2015,41.875073,-87.686256,"(41.875073359, -87.68625556)"
110576,2015-03-16,001XX W LAKE ST,THEFT,POCKET-PICKING,CTA TRAIN,True,False,2015,41.885741,-87.631712,"(41.885741029, -87.631711749)"
110577,2015-03-16,001XX W LAKE ST,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,CTA TRAIN,False,False,2015,41.885741,-87.631712,"(41.885741029, -87.631711749)"
110578,2015-03-17,100XX W OHARE ST,THEFT,PURSE-SNATCHING,CTA TRAIN,False,False,2015,41.976290,-87.905227,"(41.976290414, -87.905227221)"


In [11]:
# Merge CTA and Crime Data on Their Shared Date Column
# how="outer" keeps dates that appear in only one of the two frames; such rows
# get NaN in the columns that come from the other frame.
data_complete = pd.merge(cta_data, crime_data, how="outer", on="Date")

# Work on an Independent Copy
# pd.merge already returns a DataFrame; copying (rather than re-wrapping, which
# shares the underlying data) keeps later edits to project_data from reaching
# data_complete.
project_data = data_complete.copy()

# Display New DataFrame
project_data

Unnamed: 0,Date,Day Type,Bus,Rail Boardings,Total Rides,Block,Primary Type,Description,Location Description,Arrest,Domestic,Year,Latitude,Longitude,Location
0,2001-01-01,U,297192,126455,423647,002XX N LARAMIE AV,CRIMINAL DAMAGE,TO STATE SUP PROP,CTA BUS,False,False,2001,41.884383,-87.755360,"(41.884382699, -87.755359535)"
1,2001-01-01,U,297192,126455,423647,046XX N BROADWAY,ROBBERY,STRONGARM - NO WEAPON,CTA PLATFORM,False,False,2001,41.965917,-87.657969,"(41.965917251, -87.657969065)"
2,2001-01-01,U,297192,126455,423647,015XX W JARVIS AV,CRIMINAL DAMAGE,TO PROPERTY,CTA PLATFORM,True,False,2001,42.015918,-87.669069,"(42.015918091, -87.669068759)"
3,2001-01-01,U,297192,126455,423647,003XX N STATE ST,ROBBERY,STRONGARM - NO WEAPON,CTA PLATFORM,False,False,2001,41.888112,-87.628175,"(41.888112342, -87.628174848)"
4,2001-01-01,U,297192,126455,423647,033XX W BELMONT AV,CRIMINAL DAMAGE,TO VEHICLE,CTA PLATFORM,False,False,2001,41.939177,-87.712270,"(41.939176878, -87.712269825)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108381,2019-12-31,W,511289,431677,942966,0000X E ROOSEVELT RD,BATTERY,SIMPLE,CTA STATION,False,False,2019,41.867428,-87.626269,"(41.867428235, -87.626269143)"
108382,2019-12-31,W,511289,431677,942966,011XX W GRANVILLE AVE,THEFT,OVER $500,CTA TRAIN,False,False,2019,41.994568,-87.659057,"(41.994567578, -87.65905678)"
108383,2019-12-31,W,511289,431677,942966,0000X N DEARBORN ST,THEFT,POCKET-PICKING,CTA TRAIN,False,False,2019,41.882382,-87.629413,"(41.882381731, -87.629412971)"
108384,2019-12-31,W,511289,431677,942966,0000X N DEARBORN ST,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,CTA TRAIN,False,False,2019,41.882382,-87.629413,"(41.882381731, -87.629412971)"


In [12]:
# Count Non-Null Values in Each Column
# (a column whose count is lower than the others has missing entries)
project_data.count(axis="index")

Date                    108386
Day Type                108386
Bus                     108386
Rail Boardings          108386
Total Rides             108386
Block                   108386
Primary Type            108386
Description             108386
Location Description    108386
Arrest                  108386
Domestic                108386
Year                    108386
Latitude                108386
Longitude               108386
Location                108386
dtype: int64

In [13]:
# Replace Mislabelled Rows
# Every key found in "Location Description" is rewritten to its value.
location_relabels = {
    'CTA "L" TRAIN': "CTA TRAIN",
    'CTA "L" PLATFORM': "CTA PLATFORM",
    "CTA SUBWAY STATION": "CTA STATION",
}
relabelled_locations = project_data["Location Description"].replace(location_relabels)
project_data["Location Description"] = relabelled_locations

# Investigate Crime Count for Each Location Description
project_data["Location Description"].value_counts()

CTA PLATFORM                   37320
CTA TRAIN                      26224
CTA BUS                        22593
CTA GARAGE / OTHER PROPERTY    10177
CTA BUS STOP                    6732
CTA STATION                     5188
CTA TRACKS - RIGHT OF WAY        148
CTA PROPERTY                       4
Name: Location Description, dtype: int64

In [14]:
# Location Descriptions Whose Rows Are Removed from project_data
excluded_locations = [
    "CTA GARAGE / OTHER PROPERTY",
    "CTA TRACKS - RIGHT OF WAY",
    "CTA PROPERTY",
    "CTA BUS",
    "CTA BUS STOP",
]

# Flag Every Row at One of Those Locations (single pass over the column)
is_excluded = project_data["Location Description"].isin(excluded_locations)

# Keep the Remaining Rows; Original Index Labels Are Preserved
# .copy() makes the result an independent frame rather than a slice.
project_data = project_data.loc[~is_excluded].copy()

# Display Finished DataFrame
project_data

Unnamed: 0,Date,Day Type,Bus,Rail Boardings,Total Rides,Block,Primary Type,Description,Location Description,Arrest,Domestic,Year,Latitude,Longitude,Location
1,2001-01-01,U,297192,126455,423647,046XX N BROADWAY,ROBBERY,STRONGARM - NO WEAPON,CTA PLATFORM,False,False,2001,41.965917,-87.657969,"(41.965917251, -87.657969065)"
2,2001-01-01,U,297192,126455,423647,015XX W JARVIS AV,CRIMINAL DAMAGE,TO PROPERTY,CTA PLATFORM,True,False,2001,42.015918,-87.669069,"(42.015918091, -87.669068759)"
3,2001-01-01,U,297192,126455,423647,003XX N STATE ST,ROBBERY,STRONGARM - NO WEAPON,CTA PLATFORM,False,False,2001,41.888112,-87.628175,"(41.888112342, -87.628174848)"
4,2001-01-01,U,297192,126455,423647,033XX W BELMONT AV,CRIMINAL DAMAGE,TO VEHICLE,CTA PLATFORM,False,False,2001,41.939177,-87.712270,"(41.939176878, -87.712269825)"
5,2001-01-01,U,297192,126455,423647,013XX W 61 ST,ASSAULT,SIMPLE,CTA PLATFORM,True,False,2001,41.783289,-87.659218,"(41.783289439, -87.659218039)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108381,2019-12-31,W,511289,431677,942966,0000X E ROOSEVELT RD,BATTERY,SIMPLE,CTA STATION,False,False,2019,41.867428,-87.626269,"(41.867428235, -87.626269143)"
108382,2019-12-31,W,511289,431677,942966,011XX W GRANVILLE AVE,THEFT,OVER $500,CTA TRAIN,False,False,2019,41.994568,-87.659057,"(41.994567578, -87.65905678)"
108383,2019-12-31,W,511289,431677,942966,0000X N DEARBORN ST,THEFT,POCKET-PICKING,CTA TRAIN,False,False,2019,41.882382,-87.629413,"(41.882381731, -87.629412971)"
108384,2019-12-31,W,511289,431677,942966,0000X N DEARBORN ST,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,CTA TRAIN,False,False,2019,41.882382,-87.629413,"(41.882381731, -87.629412971)"


In [15]:
# Write the Finished DataFrame to CSV for Project Use (index column omitted)
project_data.to_csv(path_or_buf="project_data.csv", index=False)

In [16]:
# Write the Modified CTA Data to CSV (index column omitted)
cta_data.to_csv(path_or_buf="cta_data.csv", index=False)