### In this notebook we explore the potential for extracting text from the audio files that can be used to find addresses and other important words and phrases.

#### A potential API for taking partial address information and cross-referencing it with known addresses
https://smartystreets.com/products/apis/us-autocomplete-api

In [92]:
# general packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [185]:
# packages toward nlp

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [114]:
# import the csv's as dataframes

df0 = pd.read_csv('3_dataframes/df_speech_recognition.csv')        # filename
df1 = pd.read_csv('3_dataframes/df_newspeech_recognition.csv')

<font color = blue>Exploring the dataframes</font>

In [116]:
df0.shape

(2898, 4)

In [117]:
df1.shape

(1470, 4)

In [118]:
df0.head()

Unnamed: 0.1,Unnamed: 0,file_name,good_exception,audio_recognition
0,0,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
1,1,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
2,2,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
3,3,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
4,4,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,


In [119]:
df1.head()

Unnamed: 0.1,Unnamed: 0,file_name,good_exception,audio_recognition
0,0,/home/ubuntu/new_audios/Neal Manahan - negativ...,Exception,
1,1,/home/ubuntu/new_audios/Neal Manahan - negativ...,Good,the district maybe we can get a hold of them W...
2,2,/home/ubuntu/new_audios/Neal Manahan - negativ...,Good,West Virginia
3,3,/home/ubuntu/new_audios/Neal Manahan - negativ...,Exception,
4,4,/home/ubuntu/new_audios/Neal Manahan - negativ...,Exception,


<font color = blue>Merging the dataframe from both cases into one dataframe.</font>

In [120]:
df = pd.concat([df0, df1], axis=0)
df.head()

Unnamed: 0.1,Unnamed: 0,file_name,good_exception,audio_recognition
0,0,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
1,1,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
2,2,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
3,3,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
4,4,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,


In [121]:
df.shape

(4368, 4)

In [122]:
# dropping the 'Unnamed: 0' column

df.drop(columns='Unnamed: 0', inplace=True)

<font color = blue>Building a class column in the dataframe.</font>

In [124]:
df.file_name[0]

0    /home/ubuntu/audio_files/Neal Manahan - negati...
0    /home/ubuntu/new_audios/Neal Manahan - negativ...
Name: file_name, dtype: object

In [125]:
pd.options.display.max_columns = 500

In [126]:
print(df.file_name[0])

0    /home/ubuntu/audio_files/Neal Manahan - negati...
0    /home/ubuntu/new_audios/Neal Manahan - negativ...
Name: file_name, dtype: object


In [127]:
# resetting the index in the dataframe

df.reset_index(drop=True, inplace=True)

In [128]:
df.head()

Unnamed: 0,file_name,good_exception,audio_recognition
0,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
1,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
2,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
3,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
4,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,


In [129]:
df.shape

(4368, 3)

In [130]:
df.tail()

Unnamed: 0,file_name,good_exception,audio_recognition
4363,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,argument with girlfriend
4364,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Exception,
4365,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,call Papa
4366,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,Piedmont North Haledon all secure no vehicle m...
4367,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,and I was at Piedmont North Hilton


In [131]:
df.file_name[0]

'/home/ubuntu/audio_files/Neal Manahan - negative_baton_rouge_1_-01.wav'

In [132]:
type(df.file_name[0])

str

In [134]:
# building a dataframe with only five rows

test_df = df.loc[:5]

In [135]:
test_df

Unnamed: 0,file_name,good_exception,audio_recognition
0,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
1,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
2,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
3,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
4,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,
5,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,


In [136]:
test_df['class'] = df.file_name.apply(lambda x: 0 if 'negative' in x else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [137]:
test_df.head()

Unnamed: 0,file_name,good_exception,audio_recognition,class
0,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0
1,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0
2,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0
3,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0
4,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0


In [138]:
df['class'] = df.file_name.apply(lambda x: 0 if 'negative' in x else 1)

In [139]:
pd.options.display.max_rows = 100

In [140]:
df.tail(100)

Unnamed: 0,file_name,good_exception,audio_recognition,class
4268,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Exception,,1
4269,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Exception,,1
4270,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,can I get stolen,1
4271,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Exception,,1
4272,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,breathing looks like maybe,1
4273,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Exception,,1
4274,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,call Judith all-district N64 the following ann...,1
4275,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,contact the White Center to 640-2209,1
4276,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,Channel 43,1
4277,/home/ubuntu/new_audios/Rodolfo Flores Mendez ...,Good,Charlie one five,1


In [141]:
df['class'].value_counts()

0    3457
1     911
Name: class, dtype: int64

In [142]:
df[df['good_exception'] == 'Good'].shape

(1035, 4)

In [143]:
df_text = df[df['good_exception'] == 'Good']
df_text.head()

Unnamed: 0,file_name,good_exception,audio_recognition,class
11,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,3425,0
15,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,phone number for Direct Control of Kendall Street,0
31,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,20174 Perrysville,0
40,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,Medical,0
51,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,p1191 Hyundai sedan,0


In [144]:
df_text['class'].value_counts()

0    661
1    374
Name: class, dtype: int64

In [145]:
df_text['class'].value_counts(normalize=True)

0    0.638647
1    0.361353
Name: class, dtype: float64

<font color = blue>Checking the distribution of the classes amount the original Broadcastify files.</font>

In [146]:
df0['class'] = df0.file_name.apply(lambda x: 0 if 'negative' in x else 1)

In [147]:
df0.head()

Unnamed: 0.1,Unnamed: 0,file_name,good_exception,audio_recognition,class
0,0,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0
1,1,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0
2,2,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0
3,3,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0
4,4,/home/ubuntu/audio_files/Neal Manahan - negati...,Exception,,0


In [148]:
df0.shape

(2898, 5)

In [149]:
df0[df0['good_exception'] == 'Good'].shape

(323, 5)

In [150]:
df0_text = df0[df0['good_exception'] == 'Good']
df0_text.head()

Unnamed: 0.1,Unnamed: 0,file_name,good_exception,audio_recognition,class
11,11,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,3425,0
15,15,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,phone number for Direct Control of Kendall Street,0
31,31,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,20174 Perrysville,0
40,40,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,Medical,0
51,51,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,p1191 Hyundai sedan,0


In [151]:
df0_text['class'].value_counts()

1    197
0    126
Name: class, dtype: int64

In [152]:
df0_text['class'].value_counts(normalize=True)

1    0.609907
0    0.390093
Name: class, dtype: float64

<font color = blue>Build tokens for the audio text.</font>

In [153]:
df_text.loc[11]['audio_recognition']

'3425'

In [154]:
df_text.head()

Unnamed: 0,file_name,good_exception,audio_recognition,class
11,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,3425,0
15,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,phone number for Direct Control of Kendall Street,0
31,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,20174 Perrysville,0
40,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,Medical,0
51,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,p1191 Hyundai sedan,0


In [155]:
df_text.reset_index()

Unnamed: 0,index,file_name,good_exception,audio_recognition,class
0,11,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,3425,0
1,15,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,phone number for Direct Control of Kendall Street,0
2,31,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,20174 Perrysville,0
3,40,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,Medical,0
4,51,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,p1191 Hyundai sedan,0
5,53,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,Brownsville Belle Plaine 3311,0
6,58,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,California,0
7,60,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,California,0
8,68,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,details,0
9,88,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,since yesterday,0


In [156]:
df_reindexed_text = df_text.reset_index(drop=True)

In [157]:
df_reindexed_text.head()

Unnamed: 0,file_name,good_exception,audio_recognition,class
0,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,3425,0
1,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,phone number for Direct Control of Kendall Street,0
2,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,20174 Perrysville,0
3,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,Medical,0
4,/home/ubuntu/audio_files/Neal Manahan - negati...,Good,p1191 Hyundai sedan,0


In [158]:
df_reindexed_text.shape

(1035, 4)

In [159]:
text_list = []
for i in range(df_reindexed_text.shape[0]):
    text_list.append(df_reindexed_text.loc[i]['audio_recognition'])


In [160]:
len(text_list)

1035

In [161]:
text = ''
for i in range(df_reindexed_text.shape[0]):
    text += df_reindexed_text.loc[i]['audio_recognition'] + ' '

In [162]:
len(text)

27886

<font color = blue>First attempt at creating tokens from the text.</font>

In [163]:
tokenizer = RegexpTokenizer(r'\w+')

# spam_tokens = tokenizer.tokenize(spam.lower())
text_tokens = tokenizer.tokenize(text.lower())

In [166]:
len(text_tokens)

5037

In [167]:
text_tokens

['3425',
 'phone',
 'number',
 'for',
 'direct',
 'control',
 'of',
 'kendall',
 'street',
 '20174',
 'perrysville',
 'medical',
 'p1191',
 'hyundai',
 'sedan',
 'brownsville',
 'belle',
 'plaine',
 '3311',
 'california',
 'california',
 'details',
 'since',
 'yesterday',
 'are',
 'there',
 'any',
 'units',
 'in',
 'juniata',
 '324',
 '12',
 '322',
 '6th',
 'avenue',
 'between',
 'liberty',
 'and',
 'street',
 'off',
 'of',
 'weslaco',
 '3324',
 'siri',
 '112',
 '3427',
 '3432',
 'odd',
 'squad',
 'm156',
 '424',
 '3124',
 'vehicle',
 'needs',
 'to',
 'be',
 'moved',
 'the',
 'red',
 'nissan',
 'versa',
 '2136',
 'california',
 '70466',
 '408',
 'urban',
 'residents',
 'inaudible',
 'panic',
 'male',
 'white',
 'male',
 'breakdown',
 '49047',
 'to',
 '49047',
 '108',
 '32',
 'turn',
 'right',
 '2013',
 'honda',
 'all',
 'right',
 'now',
 '644',
 '769',
 '6476',
 'number',
 'one',
 'negative',
 '41901',
 'mariposa',
 'turn',
 'off',
 'alarm',
 'call',
 'davis',
 'market',
 'in',
 'stock

In [168]:
text



In [169]:
# This is the list of text.

text_list

['3425',
 'phone number for Direct Control of Kendall Street',
 '20174 Perrysville',
 'Medical',
 'p1191 Hyundai sedan',
 'Brownsville Belle Plaine 3311',
 'California',
 'California',
 'details',
 'since yesterday',
 'are there any units in Juniata',
 '324',
 '12',
 '322',
 '6th Avenue between Liberty and',
 'Street off of Weslaco',
 '3324',
 'Siri 112',
 '3427',
 '3432',
 'Odd Squad',
 'm156',
 '424',
 '3124',
 'vehicle needs to be moved the red Nissan Versa',
 '2136',
 'California 70466',
 '408',
 'Urban residents inaudible panic',
 'male white male breakdown',
 '49047 to 49047',
 '108',
 '32 turn right 2013 Honda',
 'all right now',
 '644-769-6476',
 'number one',
 'negative',
 '41901 Mariposa',
 'turn off alarm',
 'call Davis Market in Stockton California',
 'California',
 '100-300 F4',
 '46th Street',
 '30 years olds for Harrisonville tan jacket blue jeans',
 'Toyota Prius silver in color',
 'California 665 Honda',
 'neocity',
 'it does not open 901 mine it was from a VoIP phone 

In [193]:
'1 this is a bunny'.count('1')

1

In [194]:
test = '3745 North Eddie'

In [187]:
re.findall(r'[1-9]+', test)

['3745']

In [206]:
# Taking any numbers

number_identified = 0

for i in text_list:
    
    if re.findall(r'[0-9]+', i) != []:
#         print(f'{i}')
        
        number_identified += 1

print()
print(f'The number of Potential Addresses: {number_identified}')
        


The number of Potential Addresses: 354


In [208]:
# Taking numbers with a word following

number_identified = 0

for i in text_list:
    
    if re.findall(r'[0-9]+\s[A-z]+', i) != []:
        print(f'{i}')
        
        number_identified += 1

print()
print(f'The number of Potential Addresses: {number_identified}')

20174 Perrysville
p1191 Hyundai sedan
49047 to 49047
32 turn right 2013 Honda
41901 Mariposa
100-300 F4
30 years olds for Harrisonville tan jacket blue jeans
California 665 Honda
it does not open 901 mine it was from a VoIP phone named for
for Juliet Romeo 297 2012 Nissan Rogue color red it comes out of Everett
365 nv20
parking route 439 robber
1251 East Nelson Avenue District 2126
1531 Patrol
Adam gilby 12279 California DL Baker 514-2791
Channel 7 rescue 21 year old acting violent
Tyrell on a wire rack in Westerville something exploded are in 1/2 hour butter down can I send anybody having an SNL
755 Boylston
a company 4222 okay
turn off the 106 Delta one 1303
Mediacom 41182 Laguna Clay County Library 1 JFK Drive 41 1821 JFK Drive
21 Firethorn
call Net10 105 with CPR at price to Airport Road
call Kali Camaro R1 report Drive I want to keep oil Sinclair between editor and Fairfield
44531 pill
793 Boylston Street Dental Cairo
set alarm at 1 alarm at 1
negative at 1 tell my wife to call
71