# In [20]:
import pandas as pd
import random

#regex
import re

#pullenti:
#!pip install pullenti-wrapper
from pullenti_wrapper.processor import (Processor, GEO, ADDRESS)

#####################INPUT###################
# Load only the two Goskatalog columns this script works with.
fields = ['id', 'productionPlace']
goskatalog = pd.read_csv('part_json_test.csv', skipinitialspace=True, usecols=fields, encoding='UTF-8')

# Rows whose production place is missing or is an explicit "unknown" marker.
# na=True makes missing values count as unknown, so they are dropped together
# with the markers. The pattern is a raw string because it contains backslash
# sequences that are not valid Python string escapes.
unknown_place = r"неизв|б\.м\.|Б\.м\.|Б\.\sм\.|^\-$|не\sустановл|Б\/м|б\.\sм\.|не\sизвест"
is_unknown = goskatalog['productionPlace'].str.contains(unknown_place, case=False, regex=True, na=True)

# Keep the known places and renumber the rows 0..n-1.
places = goskatalog.loc[~is_unknown, fields].reset_index(drop=True)

# Delete every symbol except Russian letters, digits, whitespace and . , : -
# "ё"/"Ё" are listed explicitly because they lie outside the а-я / А-Я ranges.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this cleanup into a no-op.
bad_symbols = r'[^\d\-\s\.\,\:а-яА-ЯёЁ]'
places['productionPlace'] = places['productionPlace'].str.replace(bad_symbols, '', regex=True)

# Normalise the free-text place names before they are handed to Pullenti.
# The rules are applied strictly in the order listed; each one is a
# (regular expression, replacement) pair. Most abbreviations come in two
# variants: at the beginning of the string and after whitespace.
normalization_rules = [
    # --- common abbreviations of settlement / region types ---
    (r'^с\.', 'село '),
    (r'\sс\.', ' село '),
    (r'^(п\.|пос\.|пос\s)', 'поселок '),
    (r'\s(п\.|пос\.|пос\s)', ' поселок '),
    (r'^(г\.|гор\.|гор(\s|$))', 'город '),
    (r'\s(г\.|гор\.|гор(\s|$))', ' город '),
    (r'^(д\.|дер\.|дер\s)', 'деревня '),
    (r'\s(д\.|дер\.|дер\s)', ' деревня '),
    (r'^(р\-н\.|р\-н(\s|$)|р\s\-\sн(\s|$)|р\-\sн(\s|$)|р\s\-н(\s|$)|р\-он(\s|$))', 'район '),
    (r'\s(р\-н\.|р\-н(\s|$)|р\s\-\sн(\s|$)|р\-\sн(\s|$)|р\s\-н(\s|$)|р\-он(\s|$))', ' район '),
    (r'^губ\.', 'губерния '),
    (r'\sгуб\.', ' губерния '),
    (r'^АО\s', 'автономный округ '),
    (r'\sАО(\s|$)', ' автономный округ '),
    (r'^(обл(\s|$)|обл\.(\s|$))', 'область '),
    (r'\s(обл(\s|$)|обл\.(\s|$))', ' область '),
    # --- bibliographic city abbreviations: М., СПб., Л., Пг. ---
    (r'^(М\.$|М$|М\.\:|М\:)', 'Москва '),
    (r'^(СПб\.$|СПб$|СПб\.\:|СПб\:)', 'Санкт-Петербург '),
    (r'^(Л\.$|Л$|Л\.\:|Л\:)', 'Ленинград '),
    (r'^(Пг\.$|Пг$|Пг\.\:|Пг\:)', 'Ленинград '),
    # --- abbreviation glued to a capitalised name because the "." is missing ---
    (r'(^|\s)(гор|г)([А-Я])', r' город \3'),
    # \u0441 is the Cyrillic "с"; it is spelled as an escape so it cannot be
    # confused with the look-alike Latin "c", which never occurs in the
    # cleaned text.
    (r'(^|\s)\u0441([А-Я])', r' село \2'),
    # --- whitespace / punctuation clean-up ---
    (r' +', ' '),   # collapse runs of spaces
    (r'\.$', ''),   # drop a trailing "."
    (r'\s$', ''),   # drop a trailing whitespace character
]

# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would make every rule a silent no-op.
for pattern, replacement in normalization_rules:
    places['productionPlace'] = places['productionPlace'].str.replace(pattern, replacement, regex=True)

# Upper-case the whole string (not just the first letter of each word).
places['productionPlace'] = places['productionPlace'].str.upper()

# Run the Pullenti GEO and ADDRESS analyzers over every cleaned place name
# and collect one (item id, type, geo name, source text) tuple per match.
processor = Processor([GEO, ADDRESS])

text = ""

result_pul = []

# Number of rows for which nothing could be extracted (missing text, no
# matches, or an AttributeError raised while processing the row).
not_found = 0

for index, row in places.iterrows():
    text = row['productionPlace']

    if pd.isnull(text):
        not_found += 1
        continue

    try:
        res = processor(text)

        if not res.matches:
            not_found += 1
            continue

        for match in res.matches:
            # Start from a fresh placeholder for every match, so that a match
            # without NAME/TYPE cannot re-emit the previous match's values.
            # Placeholder rows ('NaN' name) are filtered out downstream.
            newline = (row['id'], 'NaN', 'NaN', text)

            slots = list(match.referent.slots)
            geoname = [value for key, value in slots if key == 'NAME']
            geotype = [value for key, value in slots if key == 'TYPE']

            if geoname and geotype:
                # Only the first value of each slot is used, to avoid
                # ambiguous multi-valued results.
                newline = (row['id'], geotype[0], geoname[0], text)

            result_pul.append(newline)
    except AttributeError:
        not_found += 1
        continue
        

# Turn the collected tuples into a table. The column names are passed to the
# constructor so that an empty result list still yields the four expected
# columns instead of failing on a column-count mismatch.
result_pul = pd.DataFrame(result_pul, columns=["id_item", "Type", "Geo_name", "Initial_name"])

# Drop the placeholder rows, i.e. matches for which no geo name was extracted.
result_pul = result_pul[result_pul["Geo_name"] != "NaN"]

# Table of unique geo names; place_id is the 0-based position of each name
# in order of first appearance.
unique_names = result_pul["Geo_name"].unique()
places_uniq = pd.DataFrame({'place_id': range(len(unique_names)), 'Geo_name': unique_names})


##############UNIQUE VALUES TABLE OUT#######################
places_uniq.to_csv('unique_places.csv')

# Link every item id to the id of the unique place it mentions.
intermediate_table = pd.merge(result_pul, places_uniq, how='inner', on=['Geo_name'])
intermediate_table = intermediate_table[["id_item", "place_id"]]  # drop the columns we no longer need

##############INTERMEDIATE TABLE OUT#######################
intermediate_table.to_csv('places_intermediate.csv')
