In [213]:
import pandas as pd
import psycopg2
from datetime import datetime 
import numpy as np

import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mendgaziev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Выбор параметров

In [936]:
# Folder that contains the source Excel workbook (joined with `file_name` in read_xlsx).
# NOTE(review): absolute, user-specific Windows path; the notebook will not run on
# another machine without editing this value.
path = r'C:\Users\mendgaziev\Desktop\river_flows'

In [937]:
# Workbook name without the '.xlsx' extension (read_xlsx appends it).
file_name = r'дислокация нефтеналивных судов 2021 г'

In [938]:
# Reporting year as a 4-digit string; formate_date concatenates and compares it as text.
year = '2021'

In [1128]:
# Safety switch: the final cell only calls execute_sql (database insert) when this is True.
execute = False

## Functions

In [939]:
def prognoz_query(query):
    """Run a SQL query against the PostgreSQL database and return the result.

    Parameters
    ----------
    query : str
        SQL text passed to ``pd.read_sql``.

    Returns
    -------
    pandas.DataFrame or None
        The query result, or ``None`` when connecting or reading failed
        (the error is printed, not raised, so callers must check for ``None``).
    """
    # Bind the name before the try block: if connect() itself raises, the
    # ``finally`` clause must not fail with UnboundLocalError and hide the
    # real error.
    conn = None
    try:
        conn = psycopg2.connect(dbname='', user='', password='', host='',port=5432)
        return pd.read_sql(query, conn)
    except(Exception, psycopg2.Error) as error:
        print('Error read sql: ', error)
        return None
    finally:
        if conn is not None:
            conn.close()

In [1116]:
def read_xlsx(path, file_name):
    """Load the river-flows workbook and normalise its columns.

    Parameters
    ----------
    path : str
        Folder containing the workbook; backslashes are converted to '/'.
    file_name : str
        Workbook name without the '.xlsx' extension.

    Returns
    -------
    pandas.DataFrame
        All values as strings, with the twelve columns renamed to
        direction, month, river_basin_out, river_basin_in, vessel, prod_type,
        product, value, source_port, date_out, target_port, date_in.
        A missing direction becomes 'внутренний рынок'; in ``value`` missing
        cells and 'балласт' become '0', spaces and ')' are removed, and '('
        and ',' are turned into '.'.
    """
    path = path.replace("\\", "/")
    df = pd.read_excel(path + '/' + file_name + '.xlsx', dtype=str)
    df = df.astype(str)
    df.columns = ['direction', 'month', 'river_basin_out', 'river_basin_in', 'vessel', 'prod_type', 'product', 'value',
                     'source_port', 'date_out', 'target_port', 'date_in']

    # Every pattern below is a literal. regex=False makes that explicit, so
    # '(' and ')' are never interpreted as regex metacharacters and pandas
    # does not warn about the changing default.
    df['direction'] = df['direction'].str.replace('nan', 'внутренний рынок', regex=False)

    value = df['value']
    for old, new in (('nan', '0'), ('балласт', '0'), (' ', ''), ('(', '.'), (')', ''), (',', '.')):
        value = value.str.replace(old, new, regex=False)
    df['value'] = value.str.strip()

    return df

In [941]:
def basic_form_str(df, *columns):
    """Normalise text columns in place so they can be used as merge keys.

    Each value is converted to ``str``, lower-cased, capitalised, stripped of
    surrounding whitespace and has double quotes removed (in that order).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    *columns : str
        Names of the columns to normalise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for column in columns:
        normalized = df[column].apply(lambda x: str(x).lower().capitalize().strip())
        df[column] = normalized.str.replace('"', '', regex=False)
    # Return only after ALL columns are processed. Returning from inside the
    # loop would normalise just the first column and yield None when no
    # column is given.
    return df

In [942]:
def max_id(txt):
    """Return the largest ``id`` stored in table ``txt`` (0 for an empty table).

    Parameters
    ----------
    txt : str
        Table name. It is interpolated into the SQL text as-is, so it must
        come from trusted code, never from user input.

    Returns
    -------
    number
        ``max(id)`` as returned by the database, or 0 when it is NULL/NaN.
    """
    conn = psycopg2.connect(dbname='', user='', password='', host='',port=5432)
    try:
        query = f"select max(id) from { txt }"
        result = pd.read_sql(query, conn)
    finally:
        # Close the connection even when the query fails.
        conn.close()

    # The aggregate query yields exactly one row with a column named 'max'.
    value = result['max'].iloc[0]
    # pd.isna also covers None (NULL from an empty table), on which
    # math.isnan would raise TypeError.
    if (isinstance(value, str) and value == 'nan') or pd.isna(value):
        value = 0

    return value

In [943]:
def get_fullnames_port(port):
    """Explode the ``fullnames`` alias lists of the ports table into rows.

    Parameters
    ----------
    port : pandas.DataFrame
        Must contain an ``id`` column and a ``fullnames`` column holding an
        iterable of alias strings, or None when a port has no aliases.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``port`` (alias in lower case), duplicates removed.
        The frame is empty, but still has both columns, when no port has
        aliases.
    """
    records = []
    for _, row in port.iterrows():
        fullnames = row['fullnames']
        # Skip missing alias lists: None, or a float NaN (NaN != NaN).
        if fullnames is None or (isinstance(fullnames, float) and fullnames != fullnames):
            continue
        records.extend({'id': row['id'], 'port': alias} for alias in fullnames)

    # Build the frame once instead of concatenating per alias (quadratic).
    # Explicit columns keep 'port' present even when there are no records.
    df = pd.DataFrame(records, columns=['id', 'port'])
    df['port'] = df['port'].str.lower()

    return df.drop_duplicates()

In [944]:
def find_id_source(river, port):
    """Attach the id of the departure port to every flow row.

    The port is looked up first by its ``name`` and, for rows still without
    a match, by the aliases returned by ``get_fullnames_port``.

    Parameters
    ----------
    river : pandas.DataFrame
        Flow rows with the twelve columns produced by ``read_xlsx``.
        The caller's frame is not modified.
    port : pandas.DataFrame
        Ports table with ``id``, ``name`` and ``fullnames`` columns.

    Returns
    -------
    pandas.DataFrame
        The flow rows (all values as strings) plus a ``source`` column with
        the port id as text, '-1' when no port matched.
    """
    river_columns = ['direction', 'month', 'river_basin_out', 'river_basin_in', 'vessel', 'prod_type',
                     'product', 'value', 'source_port', 'date_out', 'target_port', 'date_in']

    ports = port.copy().rename(columns={'name': 'source_port'})
    ports = basic_form_str(ports, 'source_port')
    # basic_form_str writes into the frame it receives; work on a copy so
    # the caller's data stays untouched.
    river = basic_form_str(river.copy(), 'source_port')

    df = river.merge(ports[['id', 'source_port']], on='source_port', how='left').astype(str)
    # Several ports can share a name; keep one match per flow row.
    df = df.drop_duplicates(subset=river_columns, keep='first')

    # After astype(str) an unmatched id is the literal text 'nan'.
    df_with_id = df[df.id != 'nan']
    df_without_id = df[df.id == 'nan'].drop(columns='id')
    df_without_id = basic_form_str(df_without_id, 'source_port')

    fullnames_port = get_fullnames_port(ports).rename(columns={'port': 'source_port'})
    fullnames_port = basic_form_str(fullnames_port, 'source_port')
    # An alias shared by several ports would otherwise multiply flow rows in
    # the merge below; keep the first id per alias, mirroring keep='first'
    # used for the name lookup above.
    fullnames_port = fullnames_port.drop_duplicates(subset='source_port', keep='first')

    df_fulln_id = df_without_id.merge(fullnames_port, on='source_port', how='left').astype(str)

    df_id_source = pd.concat([df_with_id, df_fulln_id]).rename(columns={'id': 'source'})
    df_id_source['source'] = df_id_source['source'].replace('nan', '-1')

    return df_id_source

In [945]:
def find_id_target(river, port):
    """Attach the id of the destination port to every flow row.

    The port is looked up first by its ``name`` and, for rows still without
    a match, by the aliases returned by ``get_fullnames_port``.

    Parameters
    ----------
    river : pandas.DataFrame
        Flow rows containing at least the twelve columns produced by
        ``read_xlsx``. The caller's frame is not modified.
    port : pandas.DataFrame
        Ports table with ``id``, ``name`` and ``fullnames`` columns.

    Returns
    -------
    pandas.DataFrame
        The flow rows (all values as strings) plus a ``target`` column with
        the port id as text, '-1' when no port matched.
    """
    river_columns = ['direction', 'month', 'river_basin_out', 'river_basin_in', 'vessel', 'prod_type',
                     'product', 'value', 'source_port', 'date_out', 'target_port', 'date_in']

    ports = port.copy().rename(columns={'name': 'target_port'})
    ports = basic_form_str(ports, 'target_port')
    # basic_form_str writes into the frame it receives; work on a copy so
    # the caller's data stays untouched.
    river = basic_form_str(river.copy(), 'target_port')

    df = river.merge(ports[['id', 'target_port']], on='target_port', how='left').astype(str)
    # Several ports can share a name; keep one match per flow row.
    df = df.drop_duplicates(subset=river_columns, keep='first')

    # After astype(str) an unmatched id is the literal text 'nan'.
    df_with_id = df[df.id != 'nan']
    df_without_id = df[df.id == 'nan'].drop(columns='id')
    df_without_id = basic_form_str(df_without_id, 'target_port')

    fullnames_port = get_fullnames_port(ports).rename(columns={'port': 'target_port'})
    fullnames_port = basic_form_str(fullnames_port, 'target_port')
    # An alias shared by several ports would otherwise multiply flow rows in
    # the merge below; keep the first id per alias, mirroring keep='first'
    # used for the name lookup above.
    fullnames_port = fullnames_port.drop_duplicates(subset='target_port', keep='first')

    df_fulln_id = df_without_id.merge(fullnames_port, on='target_port', how='left').astype(str)

    df_id_target = pd.concat([df_with_id, df_fulln_id]).rename(columns={'id': 'target'})
    df_id_target['target'] = df_id_target['target'].replace('nan', '-1')

    return df_id_target

In [946]:
def find_id_port(river, port):
    """Resolve both port ids for every flow row.

    Runs ``find_id_source`` and then ``find_id_target`` on its result, and
    returns the combined frame with a fresh 0..n-1 index.
    """
    with_source = find_id_source(river, port)
    with_both = find_id_target(with_source, port)
    return with_both.reset_index(drop=True)

In [947]:
# Russian month name (lower case) -> two-digit month number as text.
months = {
    month_name: f'{month_number:02d}'
    for month_number, month_name in enumerate(
        ('январь', 'февраль', 'март', 'апрель', 'май', 'июнь',
         'июль', 'август', 'сентябрь', 'октябрь', 'ноябрь', 'декабрь'),
        start=1,
    )
}

In [948]:
def formate_date(year, df, *columns):
    """Normalise the month and the given date columns of ``df`` in place.

    Adds ``month_numb`` (two-digit month text looked up in ``months``) and
    ``first_day_month`` ('YYYY-MM-01'), then rewrites each listed column as
    'YYYY-MM-DD' text. A date that cannot be parsed, or whose year/month
    differs from ``year`` / the row's month, is replaced by
    ``first_day_month`` for the column named 'date_out' and by '1900-01-01'
    for any other column.

    Parameters
    ----------
    year : str
        Four-digit year as text.
    df : pandas.DataFrame
        Frame with a ``month`` column holding Russian month names; modified
        in place.
    *columns : str
        Names of the date columns to normalise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    KeyError
        If a month name is not a key of ``months``.
    """
    df['month'] = df['month'].str.lower().str.strip()
    df['month_numb'] = df['month'].apply(lambda name: months[name])
    df['first_day_month'] = df['month_numb'].apply(
        lambda numb: datetime.strptime(numb + '-' + year, '%m-%Y').strftime('%Y-%m-%d'))

    for column in columns:
        # NOTE(review): parsing relies on pandas' format inference; confirm
        # how day-first strings such as '23.04.2021' should be read.
        parsed = pd.to_datetime(df[column], errors='coerce')
        # Unparseable values become '' so the comparisons below flag them.
        as_text = parsed.dt.strftime('%Y-%m-%d').fillna('')

        mismatch = (as_text.str[:4] != year) | (as_text.str[5:7] != df['month_numb'])
        fallback = df['first_day_month'] if column == 'date_out' else '1900-01-01'

        # Assign through the frame itself. Writing to the row objects yielded
        # by iterrows() is not guaranteed to change the DataFrame.
        df[column] = as_text.where(~mismatch, fallback)

    return df

In [949]:
def text_process(text):
    """Strip ASCII punctuation and Russian stop words from ``text``.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        Remaining words joined by single spaces. Stop words are matched
        case-insensitively; the kept words retain their original case.
    """
    # Load the stop-word list once per call (not once per word) and use a
    # set for constant-time membership tests.
    russian_stopwords = set(stopwords.words('russian'))
    text = text.translate(str.maketrans('', '', string.punctuation))

    return " ".join(word for word in text.split() if word.lower() not in russian_stopwords)

In [950]:
def find_id_product(df, column):
    """Predict a product code for every row of ``df`` from a text column.

    A TF-IDF + gradient-boosting classifier is trained on the
    ``product_patterns_classif`` table (``pattern`` -> ``product``) and then
    applied to the distinct values of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the text column describing the product.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged with a new ``product_code`` column.

    Raises
    ------
    RuntimeError
        If the training table could not be read.
    """
    patterns = prognoz_query("select pattern, product from product_patterns_classif where class!=0")
    if patterns is None:
        # The query helper reports failures by returning None; fail here
        # with a clear message instead of an opaque TypeError.
        raise RuntimeError('could not load product_patterns_classif')

    train_text = patterns['pattern'].apply(text_process)

    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(train_text)

    gbc = GradientBoostingClassifier(n_estimators=200, random_state=111, max_depth = 6, learning_rate=0.01)
    gbc.fit(features, patterns['product'])

    # Assign the product type: classify each distinct value once and merge
    # the result back.
    # NOTE(review): the training text goes through text_process but the
    # values predicted here do not — confirm this asymmetry is intended.
    grade = df[[column]].drop_duplicates()
    predicted = gbc.predict(vectorizer.transform(grade[column]))
    grade = grade.assign(product_code=list(predicted))

    return df.merge(grade, on = [column], how = 'left')

In [951]:
def max_id(txt):
    """Return the largest ``id`` stored in table ``txt`` (0 for an empty table).

    NOTE(review): a function with this name is also defined in an earlier
    cell; this later definition is the one in effect after a full run.

    Connection settings are read from the environment variables
    PROGNOZ_DB_NAME, PROGNOZ_DB_USER, PROGNOZ_DB_PASSWORD, PROGNOZ_DB_HOST
    and PROGNOZ_DB_PORT. The previous hard-coded values remain only as
    defaults so existing runs keep working.
    NOTE(review): remove the default password from the notebook once the
    environment is configured.

    Parameters
    ----------
    txt : str
        Table name. It is interpolated into the SQL text as-is, so it must
        come from trusted code, never from user input.

    Returns
    -------
    number
        ``max(id)`` as returned by the database, or 0 when it is NULL/NaN.
    """
    import os

    conn = psycopg2.connect(
        dbname=os.environ.get('PROGNOZ_DB_NAME', 'prognoz'),
        user=os.environ.get('PROGNOZ_DB_USER', 'prognoz'),
        password=os.environ.get('PROGNOZ_DB_PASSWORD', 'prognoz'),
        host=os.environ.get('PROGNOZ_DB_HOST', '192.168.245.50'),
        port=int(os.environ.get('PROGNOZ_DB_PORT', 5432)),
    )
    try:
        query = f"select max(id) from { txt }"
        result = pd.read_sql(query, conn)
    finally:
        # Close the connection even when the query fails.
        conn.close()

    # The aggregate query yields exactly one row with a column named 'max'.
    value = result['max'].iloc[0]
    # pd.isna also covers None (NULL from an empty table), on which
    # math.isnan would raise TypeError.
    if (isinstance(value, str) and value == 'nan') or pd.isna(value):
        value = 0

    return value

In [1097]:
def execute_sql(df):
    """Insert every row of ``df`` into the ``river_flows`` table.

    All rows are written in a single transaction: either every row is
    committed, or the transaction is rolled back and the error is re-raised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns date_out, source, target, value,
        product_code, vessel, date_in, id, direction, river_basin_out,
        river_basin_in, product, source_port and target_port.

    Raises
    ------
    KeyError
        If a required column is missing. This is checked before connecting;
        rows are no longer skipped silently.
    """
    columns = ['date_out', 'source', 'target', 'value', 'product_code', 'vessel', 'date_in', 'id', 'direction',
               'river_basin_out', 'river_basin_in', 'product', 'source_port', 'target_port']
    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise KeyError(f'execute_sql: missing columns {missing}')

    # Values are always passed as query parameters, never formatted into
    # the SQL text.
    placeholders = ', '.join(['%s'] * len(columns))
    insert = f"INSERT INTO river_flows({', '.join(columns)}) VALUES({placeholders})"

    conn = psycopg2.connect(dbname='', user='', password='', host='',port=5432)
    try:
        curs = conn.cursor()
        try:
            for _, r in df.iterrows():
                curs.execute(insert, tuple(r[name] for name in columns))
        finally:
            curs.close()
        conn.commit()
        print("закончил")
    except Exception:
        # Leave the table unchanged when any insert fails.
        conn.rollback()
        raise
    finally:
        conn.close()

## Start

In [1117]:
# Load the raw flows workbook; every column is read as text.
river_flows = read_xlsx(path, file_name)

  df.value = df.value.str.replace(' ', '').str.replace('(', '.').str.replace(')', '').str.replace(',', '.').str.strip()


In [1118]:
# Reference table of ports. prognoz_query returns None when the query fails.
ports = prognoz_query("select * from ports")



In [1119]:
# Add the 'source' and 'target' port ids ('-1' where no port matched).
df_river_flows = find_id_port(river_flows, ports)

In [1120]:
# Normalise both date columns to 'YYYY-MM-DD' text and add month_numb / first_day_month.
df_river_flows = formate_date(year, df_river_flows, 'date_out', 'date_in')

  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl

In [1121]:
# Classify the free-text product name into a 'product_code' column.
df_river_flows = find_id_product(df_river_flows, 'product')



In [1122]:
# Port ids are text at this point; go through float first so that values
# such as '72.0' convert cleanly to int.
df_river_flows['source'] = df_river_flows['source'].astype(float).astype(int)
df_river_flows['target'] = df_river_flows['target'].astype(float).astype(int)

In [1123]:
# Fetch the current max id and number the new rows consecutively after it:
# the index is set to the new ids and then moved into an 'id' column.
maxid_river_flows = max_id('river_flows')
df_river_flows.index = np.arange(maxid_river_flows + 1, maxid_river_flows + 1 + len(df_river_flows))
df_river_flows = df_river_flows.reset_index()
df_river_flows.rename(columns = {'index':'id'}, inplace=True)



In [1124]:
# Display the prepared frame for a visual check before loading.
df_river_flows

Unnamed: 0,id,direction,month,river_basin_out,river_basin_in,vessel,prod_type,product,value,source_port,date_out,target_port,date_in,source,target,month_numb,first_day_month,product_code
0,9180,внутренний рынок,апрель,Камский,Камский,Баржа Караидель,нефтепродукты,мазут,4070,Уфа,2021-04-23,Нагаевский рейд,2021-04-24,69,580,04,2021-04-01,28
1,9181,внутренний рынок,апрель,Камский,Камский,Вымпел-1,нефтепродукты,дизтопливо,46,Бетьки,2021-04-27,Н. челны,2021-04-27,637,779,04,2021-04-01,62
2,9182,внутренний рынок,апрель,Камский,Камский,Вымпел-1,нефтепродукты,дизтопливо,24,Н. челны,2021-04-27,Бетьки,2021-04-27,779,637,04,2021-04-01,62
3,9183,внутренний рынок,апрель,Камский,Камский,Бельская-68,нефтепродукты,мазут,4033,Исмайлово,2021-04-30,Нагаевский рейд,2021-04-30,708,580,04,2021-04-01,28
4,9184,внутренний рынок,апрель,Волжский,Азово-Донской,Japetus,нефтепродукты,Газойль,3981,Саратов-груз.,2021-04-01,Кавказ,1900-01-01,862,87,04,2021-04-01,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5683,14863,экспорт,ноябрь,Волжский,Азово-Донской,Пенелопа,нефтепродукты,Бензин,4426,Татьянка ii,2021-11-11,Грузия,1900-01-01,532,-1,11,2021-11-01,60
5684,14864,транзит,ноябрь,Волжский,Азово-Донской,ВФ Танкер-22,нефтепродукты,Диз. топливо,4707,Туркменистан,2021-11-15,Турция,1900-01-01,-1,-1,11,2021-11-01,62
5685,14865,транзит,ноябрь,Волжский,Азово-Донской,Балтфлот-20,нефтепродукты,Мазут,4410,Туркменистан,2021-11-15,Турция,1900-01-01,-1,-1,11,2021-11-01,28
5686,14866,транзит,ноябрь,Волжский,Азово-Донской,ВФ Танкер-7,нефтепродукты,Диз. топливо,4602,Туркменистан,2021-11-21,Турция,1900-01-01,-1,-1,11,2021-11-01,62


In [1094]:
# Distinct destination port names that did not match any port (target == -1).
df_river_flows[df_river_flows.target == -1]['target_port'].sort_values().unique()

array(['105 колыма', '110 лена', '1157 лена', '15 лена', '1520 лена',
       '204 км реки витим', '25 лена', '2505 лена', '291 км реки витим',
       '3124 км', '375 лена', '451 алдан', '469 индигирка', '688 вилюй',
       '738 вилюй', '960 лена', 'Nan', 'Азербайджан', 'Актаныш', 'Б. яры',
       'Б.яры', 'Белоусово', 'Беляевка', 'Болгария',
       'Борский канал (з-н к.маркса)', 'Вичелово (505,00 км (63 с/х))',
       'Газсале', 'Галактионово', 'Грахань (устье р.вятки)', 'Греция',
       'Грузия', 'Давыдова', 'Давыдово', 'Девятое января', 'Займище',
       'Зотино', 'Зуевы ключи', 'Италия', 'Казарки', 'Караул', 'Каспий',
       'Колесниково', 'Коровий перекат', 'Ленек', 'М. наливной',
       'Михайловка', 'Мордово (убежище)', 'Морские порты края', 'Н-порт',
       'Наречи', 'Нов. ладога', 'Новоназимово', 'Новый пункт',
       'О.золотой осередок', 'Октябрьское', 'П. вертикос', 'П. гыда',
       'П. тазовский', 'Понтонный', 'Р.печора 920', 'Ростов', 'Румыния',
       'Се-яха', 'Семаков

In [1082]:
# Look up port aliases containing the fragment "чере".
# NOTE(review): `get_fulln` is not defined in any visible cell, so this line
# fails on a fresh kernel; presumably it held the result of
# get_fullnames_port(ports) — confirm and define it explicitly.
get_fulln[get_fulln.port.str.lower().str.contains("чере")]

Unnamed: 0,id,port
0,72,череповец


In [1126]:
# Spot check: show rows whose value text is exactly '38.8'.
df_river_flows[df_river_flows.value == '38.8']

Unnamed: 0,id,direction,month,river_basin_out,river_basin_in,vessel,prod_type,product,value,source_port,date_out,target_port,date_in,source,target,month_numb,first_day_month,product_code
902,10082,внутренний рынок,июнь,Ленский,Ленский,Байкал,нефтепродукты,тс-1,38.8,Осетрово,2021-06-01,Ленск,1900-01-01,537,757,6,2021-06-01,30


In [1129]:
# Write to the database only when the `execute` switch was set to True.
if execute:
    execute_sql(df_river_flows)