In [1]:
import re 
import pandas as pd 
import numpy as np 
import get_data as gd 
import yaml as ym
from datetime import datetime
import nltk as nk 
import spacy as sp
from nltk.tag.stanford import StanfordNERTagger as st



## Functions 

In [2]:
def words_in_all_articles(search_words, all_articles): 
    """
    Take a list of search words and find them in the dataframe of articles 
    where the dataframe contains a column called content which contains the 
    text of the article

    This returns a list containing indices of the articles where the search
    words are present and a list containing the text of the relevant articles 

    """
    articles= []
    for num in range(0, len(all_articles)):
        single_page = ""
        art_content = all_articles.iloc[num].content
        single_page = single_page.join(art_content)
        single_page = single_page.replace("\n", " ")

        in_count = 0

        for word in search_words: 
            if word in single_page: 
                in_count += 1

        if in_count > 0: 
            contains_list.append(num)
            articles.append(single_page.strip())
            
    return contains_list, articles


## Generate dataset


In [3]:
# importing the config file with its values 
with open("config.yaml", 'r') as yaml_file:
        config_file = ym.safe_load(yaml_file)

#location of data file 
file_loc = config_file["file_loc"]        

#months dictionary
months = config_file["months"]
months_list = list(months.keys())
        

In [4]:
# checking the number of available articles 
auth_datetime, url_data, content_data = gd.organize_data(file_loc)
print("auth_datetime_length {}" .format( len(auth_datetime)))

auth_datetime_length 2604


In [5]:
# isolating the datetime and author info
datetime_info = []
for x in auth_datetime: 
    auth_time_info = ""
    auth_time_info = auth_time_info.join(x)
    auth_time_info = auth_time_info.lower()
    datetime_obj = auth_time_info.split('updated:')[-1]
    datetime_info.append(datetime_obj)
    


In [6]:
# using datetime package to convert string date and time to datetime object
row = []
new_datetime_format = ["None" for _ in range(len(auth_datetime))]

for indx, dt in enumerate(datetime_info): 
        dt = dt.replace("ist", "").strip().replace(",","-").replace(" ", "")
        if dt:
            for x in months_list: 
                find_val = dt.find(x)
                if find_val != -1: 
                    dt = dt.replace(x,str(months[x])+'-')
                    dd = datetime.strptime(dt, '%m-%d-%Y-%H:%M')
                    new_datetime_format[indx] = dd
                    row.append(indx)

       

In [7]:
# parsing datetime object to get date time features
all_years = ["None" for _ in range(len(auth_datetime))]
all_days = ["None" for _ in range(len(auth_datetime))]
all_months = ["None" for _ in range(len(auth_datetime))]
all_hours =["None" for _ in range(len(auth_datetime))]
all_mins = ["None" for _ in range(len(auth_datetime))]

for indx, x in enumerate(new_datetime_format):
    if x != "None": 
        all_years[indx] = x.year
        all_days[indx] = x.day
        all_months[indx] = x.month
        all_hours[indx] = x.hour
        all_mins[indx] = x.minute

In [8]:
# generating a dataframe to store all the data 

all_data = {"auth_datetime": auth_datetime, 
            "url_data": url_data,
            "content": content_data,
            "year":all_years,
            "month": all_months,
            "day": all_days, 
            "hour": all_hours, 
            "minutes": all_mins}

all_data_df  = pd.DataFrame(data = all_data)
all_data_df.head(100)


Unnamed: 0,auth_datetime,url_data,content,year,month,day,hour,minutes
0,"[Bangalore Mirror Bureau , |, Updated: May 26...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, \n, This could very well be a new , . A m...",2019,5,26,5,0
1,"[Bangalore Mirror Bureau , |, Updated: May 19...",https://bangaloremirror.indiatimes.com/bangalo...,[The Central Crime Branch (CCB) has busted a 1...,2019,5,19,3,0
2,"[Bangalore Mirror Bureau , |, Updated: May 20...",https://bangaloremirror.indiatimes.com/bangalo...,"[By Pragna L Krupa\n, \n, Sneaky sneakers aka ...",2019,5,20,20,20
3,"[By , Praveen Kumar, Praveen Kumar, , Bangalor...",https://bangaloremirror.indiatimes.com/bangalo...,"[The 30-year-old , who had attacked a flight ...",2019,5,17,15,54
4,"[By , Praveen Kumar, Praveen Kumar, , Bangalor...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, A 26-year-old auto driver has been arrest...",2019,5,18,6,0
5,"[Bangalore Mirror Bureau , |, Updated: May 18...",https://bangaloremirror.indiatimes.com/bangalo...,"[Union minister , and his party lawmaker , o...",2019,5,18,6,0
6,"[Bangalore Mirror Bureau , |, Updated: May 18...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, \n, An , was beaten up by a gang of four...",2019,5,18,6,0
7,[],https://bangaloremirror.indiatimes.com/bangalo...,[],,,,,
8,"[Bangalore Mirror Bureau , |, Updated: May 20...",https://bangaloremirror.indiatimes.com/bangalo...,"[A man waiting for a bus was robbed at , by t...",2019,5,20,4,0
9,"[Bangalore Mirror Bureau , |, Updated: May 19...",https://bangaloremirror.indiatimes.com/bangalo...,"[Police nabbed two persons, who were seen acti...",2019,5,19,3,0


In [9]:
# checking how many years of data we have
all_data_df.year.value_counts()

2018    896
2017    794
2019    418
2016    409
None     87
Name: year, dtype: int64

## Find IPC codes and sections


In [10]:
contains_list =  []

search_words = ["section","IPC","Section"]

article_indices, articles_text = words_in_all_articles(search_words, all_data_df)



In [11]:
len(article_indices)

692

In [12]:
section_df = all_data_df.iloc[article_indices]

section_df.iloc[0].content

['\n',
 'A 26-year-old auto driver has been arrested and remanded in judicial custody for ',
 ' a 19-year-old woman under the pretext of dropping her home. \n',
 'The accused has been identified as Sidda, 26, residing in ',
 ' Southern area. The incident came to light as it was a ',
 ' and the police recorded her statements last week at the Sanjay Gandhi Institute of Trauma and Orthopaedic Hospital. \n',
 '\n',
 'In her statement to the police, the woman has stated that around 6 pm after work she had gone to Yediyur with her friend a few days back. She was on her way towards Yelachenahalli metro station to return home when the accused accosted her and offered to drop her home. However, instead of dropping her home, the driver allegedly took her to different parts of the city.\n',
 '\n',
 '“The woman has stated that the driver took her to Harinagar, Gottigere, Bannerghatta Road and other places. After reaching Konanakuntte cross, he is alleged to have driven towards Byanapalya asking he

In [13]:
section_df

Unnamed: 0,auth_datetime,url_data,content,year,month,day,hour,minutes
4,"[By , Praveen Kumar, Praveen Kumar, , Bangalor...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, A 26-year-old auto driver has been arrest...",2019,5,18,6,0
8,"[Bangalore Mirror Bureau , |, Updated: May 20...",https://bangaloremirror.indiatimes.com/bangalo...,"[A man waiting for a bus was robbed at , by t...",2019,5,20,4,0
13,"[Bangalore Mirror Bureau , |, Updated: May 20...",https://bangaloremirror.indiatimes.com/bangalo...,"[Fourteen stolen , and , of Rs 82,000 face v...",2019,5,20,6,0
23,"[By , Praveen Kumar, Praveen Kumar, , Bangalor...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, \n, Three techies were robbed by a three-...",2019,5,25,5,0
26,"[By , Praveen Kumar, Praveen Kumar, , Bangalor...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, \n, Four years ago, Pavithra and her husb...",2019,5,14,6,0
27,"[Bangalore Mirror Bureau , |, Updated: May 14...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, \n, A motorist was badly bruised near Mad...",2019,5,14,6,0
31,"[Bangalore Mirror Bureau , |, Updated: May 13...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, \n, A 51-year-old businessman was assault...",2019,5,13,8,53
32,"[Bangalore Mirror Bureau , |, Updated: May 13...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, \n, A 24-year-old techie was being consta...",2019,5,13,6,0
38,"[Bangalore Mirror Bureau , |, Updated: May 8,...",https://bangaloremirror.indiatimes.com/bangalo...,"[A 31-year-old driver was , by a group of men...",2019,5,8,6,0
39,"[Bangalore Mirror Bureau , |, Updated: May 8,...",https://bangaloremirror.indiatimes.com/bangalo...,"[\n, \n, If you thought crimes happen in dark ...",2019,5,8,4,0


### Print sentences with search words in them 


In [14]:



articles_df =section_df
for num in range(0, len(articles_df)): 
    relevant_sentences= []
    
    single_page =""
    art_content = articles_df.iloc[num].content
    single_page =  single_page.join(art_content)
    single_page = single_page.replace("\n"," ").strip()
    sentences = nk.sent_tokenize(single_page)


    for sent in sentences:
        for word in search_words: 
            if word in sent: 
                relevant_sentences.append(sent)
    
    data = {'sentences': relevant_sentences, 
                'sentence_index': [num for x in range(len(relevant_sentences)) ]}

    if num == 0: 
        relevant_sentences_df = pd.DataFrame(data= data)
    else: 
        relevant_sentences_df.append(data, ignore_index = True)
        


In [15]:
relevant_sentences_df

Unnamed: 0,sentences,sentence_index
0,The police have registered a case of sexual ha...,0


In [16]:
len(articles_df)

692

In [17]:
data

{'sentences': ['The police have registered a case of abetment of suicide under Section 306 of  against the victim’s wife.'],
 'sentence_index': [691]}

## Test extracting locations


In [18]:
single_page =""
re = all_data_df.iloc[num].content
single_page =  single_page.join(all_data_df.iloc[18].content)
single_page = single_page.replace("\n"," ").strip()
single_page

"A car driver, who questioned a biker for ramming into his vehicle’s bumper, was thrashed by a gang of seven local men with rods and hollow blocks on Saturday evening in . The gang even vandalised his car and broke his right leg with a hollow brick. The victim was rushed to a nearby hospital and has been in the ICU for the past three days.   The victim has been identified as Murthy N, 28, a resident of KR Puram, who works at a private company as a driver. His brother, Somashekar, told  Mirror that on Saturday evening when Murthy was on his way to deliver a customer's car around 6 pm, a bike hit the car’s bumper from behind on Kodichikkanahalli Main Road in Bilekahalli. Murthy got down from the car and stopped the two-wheeler to ask the rider about it and an argument broke out between the two. The biker immediately called his friends and asked them to come to the spot. He said that while the two were arguing, around six to seven men joined the bike rider and started assaulting Murthy.  

In [19]:
nk.pos_tag(nk.word_tokenize(nk.sent_tokenize(single_page)[2]))

[('The', 'DT'),
 ('victim', 'NN'),
 ('was', 'VBD'),
 ('rushed', 'VBN'),
 ('to', 'TO'),
 ('a', 'DT'),
 ('nearby', 'JJ'),
 ('hospital', 'NN'),
 ('and', 'CC'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('ICU', 'NNP'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('past', 'JJ'),
 ('three', 'CD'),
 ('days', 'NNS'),
 ('.', '.')]

In [20]:
np.ones([10,1])

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [21]:
string_value ="The victim has been identified as Murthy N, 28, a resident of KR Puram, who works at a private company as a driver. His brother, Somashekar, told  Mirror that on Saturday evening when Murthy was on his way to deliver a customer's car around 6 pm, a bike hit the car’s bumper from behind on Kodichikkanahalli Main Road in Bilekahalli."
tagged = nk.word_tokenize(string_value)

In [22]:
from nltk.tag import StanfordNERTagger


In [23]:
stanford_classifier = 'D:/Data_science/nltk_stanford/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz'
stanford_ner_path = 'D:/Data_science/nltk_stanford/stanford-ner-2015-12-09/stanford-ner.jar'

In [24]:
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')


In [26]:
import os
java_path = "C:/Program Files/Java/jdk-12.0.1/bin"
os.environ['JAVAHOME'] = java_path

text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

# tokenized_text = nk.word_tokenize(string_val)
classified_text = st.tag(tagged)


CRFClassifier invoked on Thu Jun 06 16:19:13 PDT 2019 with arguments:
   -loadClassifier D:/Data_science/nltk_stanford/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz -textFile C:\Users\Pawan\AppData\Local\Temp\tmp3pl3toqn -outputFormat slashTags -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions "tokenizeNLs=false" -encoding utf-8
tokenizerOptions="tokenizeNLs=false"
loadClassifier=D:/Data_science/nltk_stanford/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz
encoding=utf-8
outputFormat=slashTags
textFile=C:\Users\Pawan\AppData\Local\Temp\tmp3pl3toqn
tokenizerFactory=edu.stanford.nlp.process.WhitespaceTokenizer
Exception in thread "main" java.lang.NoClassDefFoundError: org/slf4j/LoggerFactory
	at edu.stanford.nlp.io.IOUtils.<clinit>(IOUtils.java:42)
	at edu.stanford.nlp.ie.AbstractSequenceClassifier.loadClassifier(AbstractSequenceClassifier.java:1484)
	at edu.stanford.nlp.ie.AbstractSequenceClassifier.loa

OSError: Java command failed : ['C:/Program Files/Java/jdk-12.0.1/bin\\java.exe', '-mx1000m', '-cp', 'D:/Data_science/nltk_stanford/stanford-ner-2015-12-09/stanford-ner.jar', 'edu.stanford.nlp.ie.crf.CRFClassifier', '-loadClassifier', 'D:/Data_science/nltk_stanford/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', '-textFile', 'C:\\Users\\Pawan\\AppData\\Local\\Temp\\tmp3pl3toqn', '-outputFormat', 'slashTags', '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer', '-tokenizerOptions', '"tokenizeNLs=false"', '-encoding', 'utf-8']

In [22]:
nlp = sp.load("en_core_web_sm")


ValueError: spacy.syntax.nn_parser.Parser size changed, may indicate binary incompatibility. Expected 72 from C header, got 64 from PyObject