# TextToLocation

In [1]:
!export LC_CTYPE=en_US.UTF-8

## Requirements

In [2]:
# Mount Google Drive inside the Colab VM; the zipped pre-trained model is
# read from it in the next cell.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Pre trained model
!unzip '/content/drive/MyDrive/Trained_Model.zip' -d '/content/Trained_Model'

Archive:  /content/drive/MyDrive/Trained_Model.zip
   creating: /content/Trained_Model/content/Trained Model/
   creating: /content/Trained_Model/content/Trained Model/attribute_ruler/
  inflating: /content/Trained_Model/content/Trained Model/attribute_ruler/patterns  
   creating: /content/Trained_Model/content/Trained Model/lemmatizer/
   creating: /content/Trained_Model/content/Trained Model/lemmatizer/lookups/
  inflating: /content/Trained_Model/content/Trained Model/lemmatizer/lookups/lookups.bin  
   creating: /content/Trained_Model/content/Trained Model/tagger/
  inflating: /content/Trained_Model/content/Trained Model/tagger/model  
  inflating: /content/Trained_Model/content/Trained Model/tagger/cfg  
  inflating: /content/Trained_Model/content/Trained Model/config.cfg  
   creating: /content/Trained_Model/content/Trained Model/senter/
  inflating: /content/Trained_Model/content/Trained Model/senter/model  
 extracting: /content/Trained_Model/content/Trained Model/senter/cfg  


# Production line code

In [4]:
!pip install geopandas



In [5]:
import spacy
import geopy
from geopy.geocoders import Nominatim
import requests
import csv
import geopandas as gpd
import pandas as pd
import zipfile

In [6]:
class TextToLocation:
  """Extract entities from report text, tabulate them, geocode the location
  columns and export the result as a point shapefile.

  Intended order of use: ``get_contents`` -> ``create_table`` -> ``geocode``
  -> ``export``. Only ``create_table`` needs an instance (it uses the loaded
  spaCy model); the other steps are static methods and can be called either
  on the class or on an instance.
  """

  # Entity labels that become the columns of the output table, in order.
  # ``geocode`` addresses the CSV by position, so this order matters.
  COLUMNS = ("NAME", "DATE", "TIME", "AREA", "CITY", "DISTRICT", "STATE",
             "COUNTRY", "ORGANISATION", "INCIDENT")

  def __init__(self, model_path):
    """
    model_path: Path of the pre-trained model
    """
    self.model = spacy.load(model_path)

  @staticmethod
  def get_contents(*args):
    """
    Collect text samples from any mix of inputs.

    Each argument may be:
      * a path ending in ``.txt``: the file is read and every line becomes
        one sample;
      * a path ending in ``.zip``: every ``.txt`` member is decoded and added
        as one sample;
      * any other string: added unchanged as one sample.

    ``bytes`` arguments are decoded as UTF-8 first. Arguments of any other
    type are ignored.

    Returns a list of strings in argument order.
    """
    contents = []
    for item in args:
      if isinstance(item, bytes):
        # bytes.endswith('.txt') would raise TypeError, so normalise to str.
        item = item.decode("utf-8")
      if not isinstance(item, str):
        continue

      if item.endswith('.txt'):
        with open(item, 'r', encoding="utf-8") as f:
          # extend, not assign: samples gathered from earlier arguments
          # must be kept.
          contents.extend(f.read().split('\n'))
      elif item.endswith('.zip'):
        with zipfile.ZipFile(item, 'r') as z:
          for member in z.namelist():
            if member.endswith('.txt'):
              contents.append(z.read(member).decode())
      else:
        contents.append(item)

    return contents

  # Create a output table
  def create_table(self, contents, filename="output.csv", hasLabels=True):
    """
    Run the model on every sample and write the entities found to a CSV.

    contents: iterable of text samples (e.g. the result of ``get_contents``)
    filename: path of the CSV file to write
    hasLabels: accepted for backward compatibility; both settings process
      ``contents`` in the same way

    The table has one row per sample and one column per label in
    ``COLUMNS``; each cell is the list of entity texts with that label.
    Entities whose label is not in ``COLUMNS`` are ignored. The CSV is
    written with the DataFrame index as its first column.

    Returns the table as a pandas DataFrame.
    """
    table = {label: [] for label in self.COLUMNS}

    for text in contents:
      # Single sample
      row = {label: [] for label in self.COLUMNS}
      for ent in self.model(text).ents:
        # Skip labels without a column instead of raising KeyError.
        if ent.label_ in row:
          row[ent.label_].append(ent.text)

      # Append sample to table
      for label in table:
        table[label].append(row[label])

    df = pd.DataFrame(table)
    df.to_csv(filename)
    return df

  @staticmethod
  def destrip_table(df):
    """
    Return a new DataFrame in which every cell is converted to ``str`` and
    has leading/trailing ``[`` and ``]`` characters removed.
    """
    return df.apply(lambda col: col.astype(str).str.strip("[]"))

  @staticmethod
  def _location_query(row, columns):
    """Join the given CSV columns of ``row`` into one query string, removing
    the ``[``, ``]`` and ``'`` characters left by the list formatting."""
    query = " ".join(row[i] for i in columns)
    return query.replace("[", "").replace("]", "").replace("'", "")

  # Geocoding of CSV by concatenating different entities
  @staticmethod
  def geocode(in_file, out_file="geocoded_data.csv"):
    """
    Geocode every row of the CSV written by ``create_table``.

    in_file: path of the CSV to read
    out_file: path of the CSV to write; it holds the input columns plus
      ``latitude`` and ``longitude``

    Columns are addressed by position. Column 0 is the index column written
    by ``DataFrame.to_csv``; columns 4-8 are AREA, CITY, DISTRICT, STATE and
    COUNTRY. Each row is tried with progressively coarser queries until one
    resolves. Rows that cannot be resolved print "Location not found" and
    are left out of the output file.

    Raises ValueError if ``in_file`` has no header row.
    """
    # we are using OpenStreetmaps Geocoder
    # NOTE(review): the public Nominatim service has a usage policy on request
    # rate; for large inputs consider wrapping calls in geopy's RateLimiter.
    geolocator = Nominatim(user_agent="Garudaltics")
    timeout_s = 10000  # per-request timeout handed to geopy

    # Removing low level entities one by one to get geocoding
    # NOTE(review): the fallback queries do not include COUNTRY (column 8);
    # confirm this is intended.
    fallbacks = ((4, 5, 6, 7, 8), (5, 6, 7), (6, 7), (7,))

    with open(in_file, 'r', encoding="utf-8", newline='') as f_in:
      csv_reader = csv.reader(f_in)

      # Removing heading
      header = next(csv_reader, None)
      if header is None:
        raise ValueError(f"{in_file} is empty; expected a CSV header row")

      # Same encoding as the input so non-ASCII place names round-trip.
      with open(out_file, 'w', newline='', encoding="utf-8") as f_out:
        csv_writer = csv.writer(f_out)

        # Adding lat, long headings
        csv_writer.writerow(header + ["latitude", "longitude"])

        for row in csv_reader:
          if not row:
            continue  # blank line in the CSV

          location = None
          for columns in fallbacks:
            query = TextToLocation._location_query(row, columns)
            if not query.strip():
              continue  # nothing to look up; skip the network call
            location = geolocator.geocode(query, timeout=timeout_s)
            if location:
              break

          if not location:
            print("Location not found")
            continue

          print((location.latitude, location.longitude))

          # Adding lat, long values to same row
          csv_writer.writerow(row + [location.latitude, location.longitude])
    print("Done.")

  @staticmethod
  def export(infile, outfile="point_data.shp"):
    """
    Convert a geocoded CSV into a point shapefile.

    infile: CSV with ``longitude`` and ``latitude`` columns
    outfile: path of the ESRI Shapefile to write

    Points are created from (longitude, latitude) with CRS EPSG:4326.
    """
    df = pd.read_csv(infile)
    # convert the DataFrame to a GeoDataFrame
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
        crs="EPSG:4326",
    )
    gdf.to_file(outfile, driver="ESRI Shapefile")

In [8]:
# Run the whole pipeline: text -> entity table -> geocoded CSV -> shapefile.
# Each step reads the file from the same relative path the previous step
# wrote it to, so the cell does not depend on the working directory being
# /content.
TABLE_CSV = "output.csv"
GEOCODED_CSV = "geocoded_data.csv"  # file name that geocode() writes by default

model = TextToLocation('/content/Trained_Model/content/Trained Model')
content = model.get_contents('/content/Reports_final_doc.txt')
model.create_table(contents=content, filename=TABLE_CSV)
TextToLocation.geocode(TABLE_CSV)
TextToLocation.export(GEOCODED_CSV)

(27.9254195, 96.1647135)
(25.5379432, 91.2999102)
(23.837628, 91.2805664)
Location not found
Done.


  gdf.to_file("point_data.shp", driver="ESRI Shapefile")
