This file is going to show how we decided what categories our messages fall into. It will also show how we assigned each message to a category. The assigning of messages to categories is also shown in the ETL for the file, 'Subway_Alerts_ETL'. 

In [1]:
import pandas as pd
from azure.storage.blob import BlobClient, BlobServiceClient, ContainerClient
from config import BLOB_URL, CONTAINER, STOR_ACCT


We need to read in the file and assign it headers. 

In [4]:
# Assigning column header names
column_names = ['Alert_Code', 'DateTime', 'Agency', 'Title', 'Message', 'Train_Line', 'Borough']

# Reading in the data 
blob_client1 = BlobClient.from_blob_url('https://{STOR_ACCT}.blob.core.windows.net/{CONTAINER}/Nonupdates_Active_Borough_Train_Subway_Alerts.csv/part-00000-tid-2334004524103334285-aee0707e-bc17-434d-9d5e-7e56df1da108-713-1-c000.csv?{SAS_TOKEN}')
full_alerts = pd.read_csv(blob_client1.download_blob(), names = column_names, header=None)




Our next step is to figure out the underlying topics in the "Messages" column. We will use an LDA model to provide us with topics and keywords per topic. We then had to sort through the resulting topics to decide which categories made the most sense based on the model and the data itself. We narrowed our categories down to 7 topics. We utilized stop words, lemmatization, and stemming to provide us with the most accurate topics possible. 

In [5]:
# Import the neccessary modules
from gensim import corpora, models
from gensim.corpora import Dictionary
import gensim
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer


# We needed to drop all the nulls before the model could be run 
full_alerts.dropna(subset=["Message"], inplace=True)

# Tokenize the messages
messages = full_alerts["Message"].apply(lambda x: x.split())

# created the stop_words variable which was set to a pre-loaded stop words list
stop_words = set(stopwords.words("english"))

# I created my own list of stopwords
my_stopwords = {"with", "at", "are", "of", "because", "and", "to", "has", "or", "A", "St,", "-", "St", "PM", "AM", "Av,",
                "St.", "Northbound", "Southbound", "PM,", "5", "Mon", "Fri,", "E", "42", "180", "Canal", "C", "6", "14", "Central-42", "4",
                "St-Herald", "Brooklyn", "St-Union", "D", "59", "L", "Av", "8", "Broadway", "Av.", "Junction", "Myrtle-Wyckoff", "30", "2",
                "3", "Some", "St-Grand", "southbound", "149", "All", "nevins", "111", "Rd", "7", "Crown", "F", "Jackson", "Hts-Roosevelt",
                "Queens", "Bay", "18", "20", "Av-138", "Pelham", "Pkwy", "Stillwell", "Hunts", "Utrecht", "St-Washington", "St-Columbus", "1", "Mon,"
                "use", "AM", "For", "free", "N,", "D,", "New", "Lots", "6:30", "Q,", "Ctr,", "Blvd,", "Due", "Ridge", "Bowling", "Hall.", "Bridge-City",
                "G", "Church", "137", "Avs.", "St-City", "East", "Court", "9", "SIR", "DAYS", "Prince's", "Arthur", "Pkwy,", "Rockaway", "Blvd", "104", "bus"
                "Park", "use", "These", "trip.", "one", "1,", "116", "Cortlandt", "207", "Avenue", "3,", "The", "Sutphin", "Av-149", "Point", "W", "sq", "Sq.",
                "Jay", "Sq,", "34", "Hamilton", "Fort", "J", "M", "Myrtle", "Broad", "Delancey", "St-Essex", "Atlantic", "Av-Barclays", "Franklin", "Ctr", "Ctr."
                "Hts-Utica", "Flatbush", "Bedford-Nostrand", "local", "Sq", "M", "Expect", "northbound", "B", "Coney", "Q", "B", "Island-Stillwell", "N", "Kings",
                "Island-bound", "Prospect", "Brighton", "R", "N", "36", "57", "DeKalb", "If", "145", "168", "Euclid", "Utica", "Park", "Hoyt-Schermerhorn", "St-bound"
                "125", "Grand", "Circle", "Circle.", "86", "West", "Junius", "103", "65", "86", "A,", "2,", "C,", "Astoria-Ditmars", "Mon,", "To", "Between", "Metropolitan",
                "Fulton", "From", "Trade", "World", "Center", "Bedford", "Eastchester-Dyre", "Prince", "79", "St-Woodside", "Kingsbridge", "Blvd.", "We", "Our", "St-MetroTech",
                "Van", "York", "High", "bus", "There", "As", "Steinway", "Plaza", "125", "Queensboro", "E,", "F,", "Plaza.", "St-Hudson", "Yards", "buses", "shuttle",
                "Free", "No", "Shuttle", "27", "N.", "10:15", "13", "Vernon", "Blvd-Jackson", "4,", "City", "Saturday", "Jefferson", "Tottenville-bound", "AM", "11:45",
                "Nevins", "Chambers", "Concourse", "135", "Concourse", "161", "Central", "138", "96", "Times", "Sq-42", "St-7", "Lexington", "72", "Manhattan", "Park", "Clark",
                "Green", "AM,", "Manhattan-bound", "Jamaica", "71", "Roosevelt", "21", "Center-bound", "28", "Blvd-Lehman", "Bronx-bound", "Beach", "Av-9", "Bus", "Park."
                "50", "Authority", "St-Port", "Far", "155", "Forest", "Hills-71", "Hts-Utica", "Junction", "Please", "This", "mta.info", "511", "MetroCard", "Sheepshead", "49"
                "buses.", "Essex", "Circle", "Lorimer", "South", "Ferry", "Junction.", "81", "Dyckman", "Bridge", "47-50", "Sts-Rockefeller", "Jamaica-bound", "F.", "St.Queensbridge",
                "Parkchester", "Jamaica-Van", "Whitlock", "Tottenville", "Woodhaven", "74", "Mets-Willets", "Kew", "Gardens-Union", "St-Queensbridge", "St-Broadway",
                "Bergen", "Brooklyn", "3:30", "Flushing", "St-Penn", "Wall", "Ctr.", "Park.", "Siclen", "110", "45", "61", "Neptune", "Concourse,", "Hunters", "R.", "Hall",
                "Borough", "Burnside", "Green.", "Eastern", "St-bound", "Main", "Flushing-Main", "33", "LIRR", "67", "(BKLYN).", "Center.", "Smith-9", "St-Rawson", "Asoria-bound",
                "Use", "NIGHTS", "2.", "1.", "two", "Island", "Saturday,", "Richmond", "George-bound", "Plaza", "St-MetroTech.", "Park-bound", "Broadway.", "Parkside", "Con", "Bridge-bound",
                "Bleecker", "Center-Parsons/Archer.", "Reminder:", "&", "Park-242", "215", "Hills", "St-Queensbridge.", "Fri", "121", "Hwy", ".", "You", "Feb", "Ocean", "(Brooklyn).",
                "Plaza", "12", "Av-53", "Pond", "Center", "Allow", "Plaza", "may", "M,", "Note:", "Av-Rutland", "Parkchester", "Elmhurst", "Hill", "Gun", "Wakefield-241", "Baychester",
                "Tremont", "Av-Brooklyn", "Morris", "Westchester", "68", "Parsons", "191", "53", "astoria-bound", "z", "75", "sutter", "170", "st-broadway", "to/from",
                "rd.", "st-yankee", "canarsie-rockaway", "square-42", "b,", "hwy.", "st-wash", "square", "23", "rd.", "rockaway-mott", "(bklyn)", "b,",
                "rd.", "college", "bronx", "25", "whitehall", "z", "uptown", "effect:", "st-yankee", "181", "st-wash", "av-59", "b,", "brooklyn.", "50", "62", "follows:", "sutter", "livonia",
                "167", "pk", "whitehall", "hewes", "square", "heights-roosevelt", "square-42", "square.", "tue", "11", "sat", "j,", "december", "green,", "30,", "st-wash", "av-63",
                "hall,", "rector", 'st-bryant', "st-metrotech", "77", "2018", "10:45", "sun", "college,", "nyct", "2017", "marcy", "rockaway-mott", "39", "q56", "31", "college.", "crescent",
                "tpke", "apr", "tue", "10", "or,", "b63", "astoria-bound", "bus.", "hwy,", "pkwy.", "marcy", "routes:", "157", "hoyt", "49", "rd.", "pl.", "broadway-lafayette", "astor", "12:01", "wed",
                "4/5", "(110", "st).", "s", "canarsie-rockaway", "blvd-archer", "howard", "11:30", "hewes", "n/q/r", "avs", "u", "pkwi"}

# Joined the two stop words lists  
stop_words = stop_words.union(my_stopwords)


messages = [[word for word in message if word not in stop_words]
            for message in messages]

# Convert the words in the messages to lowercase
lowercase_messages = [[word.lower() for word in message]
                      for message in messages]

# Initialized the lemmatizer
lemmatizer = WordNetLemmatizer()
# Initialized the stemmer 
stemmer = PorterStemmer()

# lemmatized the messages
lemmatized_messages = [[lemmatizer.lemmatize(word).replace(
    ".", "") for word in message] for message in lowercase_messages]

# Stemmed the messages
stemmed_messages = [[stemmer.stem(word).replace(
    ",", "") for word in message] for message in lemmatized_messages]


# Create a dictionary from the stemmed messages
dictionary = corpora.Dictionary(stemmed_messages)

# Create a bag-of-words representation of the stemmed messages
bow_corpus = [dictionary.doc2bow(message) for message in stemmed_messages]

# Train the LDA model
lda_model = models.LdaModel(bow_corpus, num_topics=14, passes=100,
                            iterations=100, id2word=dictionary, random_state=10)

# Print the top 10 keywords for each topic
for index, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(index+1, topic))



Topic: 1 
Words: 0.194*"track" + 0.136*"train" + 0.133*"mainten" + 0.098*"delay" + 0.097*"run" + 0.048*"person" + 0.048*"direct" + 0.037*"left" + 0.031*"unauthor" + 0.009*"square-42"
Topic: 2 
Words: 0.080*"due" + 0.062*"transfer" + 0.050*"station" + 0.040*"train" + 0.036*"take" + 0.035*"skip" + 0.030*"via" + 0.027*"run" + 0.027*"servic" + 0.024*"nearbi"
Topic: 3 
Words: 0.204*"activ" + 0.164*"train" + 0.122*"investig" + 0.118*"delay" + 0.111*"nypd" + 0.078*"brake" + 0.058*"train'" + 0.054*"fdni" + 0.031*"automat" + 0.009*"place"
Topic: 4 
Words: 0.196*"run" + 0.177*"train" + 0.177*"problem" + 0.159*"delay" + 0.135*"signal" + 0.039*"switch" + 0.036*"direct" + 0.034*"bound" + 0.009*"u" + 0.006*"av-59"
Topic: 5 
Words: 0.231*"train" + 0.143*"run" + 0.113*"line" + 0.113*"servic" + 0.098*"express" + 0.044*"along" + 0.038*"stop" + 0.034*"delay" + 0.020*"termin" + 0.016*"bypass"
Topic: 6 
Words: 0.119*"crew" + 0.084*"rail" + 0.071*"train" + 0.071*"condit" + 0.056*"updat" + 0.039*"fix" + 0.03

Now we have 14 different topics to sort through. However, not all of the topics will be useful. By comparing the actual messages with the topics, we decided that topics 1,3,4,10,11, and 13 would be helpful. Below, we create 2 dictionaries which have keywords as keys, and the category as its value. We then wrote a loop that goes through each message and assigns the row to a certain category. The category is then added to a new column called "Message_Category". 

In [10]:
# Define the keywords or phrases that should trigger a certain category
single_keywords = {
    "nypd": "NYPD/FDNY Investigation",
    "police": "NYPD/FDNY Investigation",
    "investigation": "NYPD/FDNY Investigation",
    "investigate": "NYPD/FDNY Investigation",
    "unauthorized person": "NYPD/FDNY Investigation",
    "fdny": "NYPD/FDNY Investigation",
    "fire": "NYPD/FDNY Investigation",
    "assaulted": "NYPD/FDNY Investigation",
    "assault": "NYPD/FDNY Investigation", 
    "disruptive": "NYPD/FDNY Investigation",
    "altercation": "NYPD/FDNY Investigation",
    "unruly passenger": "NYPD/FDNY Investigation",
    "unruly": "NYPD/FDNY Investigation",
    "maintenance": "Train/Track Maintenance",
    "clean": "Train/Track Maintenance", 
    "cleaned": "Train/Track Maintenance",
    "cleaning": "Train/Track Maintenance",
    "switch": "Train/Track Maintenance",
    "replaced rails": "Train/Track Maintenance",
    "replaced a rail": "Train/Track Maintenance",
    "rail replacement": "Train/Track Maintenance",
    "replace rails": "Train/Track Maintenance",
    "replacing rails": "Train/Track Maintenance",
    "rail condition": "Train/Track Maintenance",
    "replace a rail": "Train/Track Maintenance",
    "broken rail": "Train/Track Maintenance",
    "tree on the tracks": "Train/Track Maintenance",
    "debris": "Train/Track Maintenance",
    "garbage": "Train/Track Maintenance",
    "vandalized": "Train/Track Maintenance",
    "vandalism": "Train/Track Maintenance",
    "dirty": "Train/Track Maintenance",
    "track work": "Train/Track Maintenance",
    "from the tracks": "Train/Track Maintenance",
    "track replacement": "Train/Track Maintenance", 
    "work train": "Train/Track Maintenance",
    "rail power": "Train/Track Maintenance",
    "repair": "Train/Track Maintenance",
    "move equipment": "Train/Track Maintenance",
    "track condition": "Train/Track Maintenance",
    "inspection": "Train/Track Maintenance",
    "replacement track": "Train/Track Maintenance",
    "remove": "Train/Track Maintenance",
    "elevators": "Mechanical Issues",
    "mechanical": "Mechanical Issues",
    "emergency brake": "Mechanical Issues",
    "door problem": "Mechanical Issues",
    "malfunction": "Mechanical Issues",
    "power outage": "Mechanical Issues",
    "loss of power": "Mechanical Issues",
    "communication issue": "Mechanical Issues",
    "communications issue": "Mechanical Issues",
    "lighting": "Mechanical Issues",
    "connectivity": "Mechanical Issues",
    "communications problem": "Mechanical Issues",
    "stalled train": "Mechanical Issues",
    "signal": "Signal Issues",
    "sick": "Medical",
    "ems": "Medical",
    "injured": "Medical",
    "injury": "Medical",
    "medical": "Medical",
    "emergency teams": "Medical",
    "emergency crews": "Medical",
    "struck by": "Medical",
    "emergency personel": "Medical",
    "are running on": "Change of Service",
    "are running along": "Change of Service",
    "running express": "Change of Service",
    "for continuing service": "Change of Service"
  
  
    
}
# Same as above
combined_keywords = {
    ("someone", "doors"): "NYPD/FDNY Investigation",
    ("passenger", "doors"): "NYPD/FDNY Investigation",
    ("removed", "tracks"): "Train/Track Maintenance",
    ("remove", "tracks"): "Train/Track Maintenance",
    ("removed", "service"): "Train/Track Maintenance",
    ("remove", "service"): "Train/Track Maintenance",
    ("inspect", "tracks"): "Train/Track Maintenance",
    ("inspected", "tracks"): "Train/Track Maintenance",
    ("isolate", "train"): "Train/Track Maintenance",
    ("removed", "car"): "Train/Track Maintenance",
    ("move", "storage"): "Train/Track Maintenance",
    ("equipment", "work"): "Train/Track Maintenance",
    ("brakes", "activated"): "Mechanical Issues",
    ("brakes", "activate"): "Mechanical Issues",
    ("brakes", "activating"): "Mechanical Issues",
    ("brake's", "activated"): "Mechanical Issues",
    ("loss of", "power"): "Mechanical Issues",
    ("share", "track"): "Change of Service",
    ("sharing", "track"): "Change of Service",   
    ("service", "suspended"): "Change of Service",
    ("divert", "trains"): "Change of Service",
    
}

# Loop through each message in the dataframe
for index, row in full_alerts.iterrows():
    message = row["Message"]
    if pd.isnull(message) or message is None:
        message = ''
    if isinstance(message, str):
        message = message.lower()
    # Initialize the category to None
    category = None
    # Loop through the keywords
    for keyword, cat in single_keywords.items():
        # Check if the keyword is in the message and assign a value
        if keyword in message:
            category = cat
            break
        # Check if the keyword is in the message and assign a value
    for keywords, cat in combined_keywords.items():
        if all(k in message for k in keywords):
            category = cat
            break
    if category is None:
        category = "Miscellaneous"
    full_alerts.at[index, "Message_Category"] = category
