In [None]:
import cx_Oracle
from os import getenv
import datetime, json
import os
from google.cloud import secretmanager

import numpy as np
import pandas as pd
from pandas.tseries.offsets import Day

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


import cufflinks as cf
# Enable cufflinks/plotly offline mode.
cf.go_offline()
# NOTE(review): offline=False together with world_readable=True looks at odds with
# go_offline() above — confirm the intended mode; world_readable=True presumably
# makes any plot that gets uploaded publicly visible.
cf.set_config_file(offline=False, world_readable=True)

import warnings
warnings.filterwarnings('ignore')

In [None]:
def set_secrets_as_envs():
  """Export the team secret from Google Secret Manager as environment variables.

  The secret's resource name is read from the ``KNADA_TEAM_SECRET`` environment
  variable. The payload of its latest version is decoded as UTF-8, parsed as a
  JSON object, and every entry is written into ``os.environ``.

  Raises:
    KeyError: if ``KNADA_TEAM_SECRET`` is not set.
  """
  client = secretmanager.SecretManagerServiceClient()
  resource_name = f"{os.environ['KNADA_TEAM_SECRET']}/versions/latest"
  response = client.access_secret_version(name=resource_name)
  payload = json.loads(response.payload.data.decode('UTF-8'))
  # os.environ only accepts str values; a JSON number or boolean in the secret
  # would otherwise raise TypeError. String values are passed through untouched.
  os.environ.update({
    key: value if isinstance(value, str) else str(value)
    for key, value in payload.items()
  })

In [None]:
def _load_oracle_secrets():
  """Load the secrets into the environment and return the Oracle connection settings.

  Returns:
    dict with the keys ``user``, ``password``, ``host``, ``service``
    (read from the ``UID``, ``PWD``, ``HOST`` and ``SERVICE`` environment
    variables after ``set_secrets_as_envs()`` has run) plus the fixed
    ``encoding`` / ``nencoding`` values.
  """
  set_secrets_as_envs()
  return dict(
    user=os.getenv("UID"),
    password=os.getenv("PWD"),
    host=os.getenv("HOST"),
    service=os.getenv("SERVICE"),
    encoding="UTF-8",
    nencoding="UTF-8"
  )

# The loader has its own name so that this cell can be re-run: binding the
# result to the function's name would make a second run fail with
# "'dict' object is not callable".
oracle_secrets = _load_oracle_secrets()

In [None]:
def get_data(sql, schema='DVH_FAM_EF'):
    """Run ``sql`` against the Oracle database and return the result as a DataFrame.

    Args:
        sql: query text handed to ``pandas.read_sql``.
        schema: name appended in square brackets to the user name
            (``user[schema]``) when connecting. Defaults to the value that was
            previously hard-coded.

    Returns:
        The query result as a ``pandas.DataFrame``, or ``None`` when a
        ``cx_Oracle.Error`` occurred (the error is printed, not raised).
    """
    user = oracle_secrets['user'] + f"[{schema}]"
    dsn_tns = cx_Oracle.makedsn(oracle_secrets['host'], 1521, service_name=oracle_secrets['service'])
    try:
        conn = cx_Oracle.connect(user=user, password=oracle_secrets['password'], dsn=dsn_tns)
        try:
            return pd.read_sql(sql=sql, con=conn)
        finally:
            # Close the connection even when the query raises, so a failed
            # read does not leave a session open.
            conn.close()
    except cx_Oracle.Error as error:
        print(error)
        return None

In [None]:
# Select kafka_offset and the 'vedtaksbegrunnelse' value extracted from the JSON in
# the melding column, for SKOLEPENGER rows where that value is not null.
sql = """ 
select kafka_offset,
JSON_VALUE(melding, '$.vedtaksbegrunnelse') AS vedtaksbegrunnelse 
from dvh_fam_ef.fam_ef_meta_data 
where STONADSTYPE = 'SKOLEPENGER' and JSON_VALUE(melding, '$.vedtaksbegrunnelse') is not null
"""

# get_data prints the error and returns None when the query fails with a cx_Oracle.Error.
df = get_data(sql = sql)

In [None]:
# NOTE(review): get_data already returns the frame produced by pd.read_sql, so this
# re-wrap looks redundant — confirm nothing relies on it.
df = pd.DataFrame(df)

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Show the VEDTAKSBEGRUNNELSE text of the row with index label 1 as a sample.
df.loc[1, 'VEDTAKSBEGRUNNELSE']

In [None]:
# Compute the length (number of characters) of each message

df['Tekst_len'] = df['VEDTAKSBEGRUNNELSE'].str.len()

In [None]:
# Normalize the text: lower-case it and turn line breaks into spaces. Replacing
# '\n' with an empty string would glue the last word of one line to the first
# word of the next and distort the token counts computed from this column.
df['VEDTAKSBEGRUNNELSE'] = (
    df['VEDTAKSBEGRUNNELSE']
    .str.lower()
    .str.replace('\n', ' ', regex=False)
)

In [None]:
# Preview the first rows again.
df.head()

In [None]:
# Remove the column-width limit so long text values are displayed in full.
pd.set_option('display.max_colwidth', None)

In [None]:
# Look at the first few texts whose recorded length is below 100.
filt = df['Tekst_len'].lt(100)
df.loc[filt, 'VEDTAKSBEGRUNNELSE'].head()

In [None]:
stop_words = set(stopwords.words('norwegian'))
no_list = ['.', ',', 'til', 'kr', 'kroner','på', 'for'] + list(stop_words)
# Frozen copy of no_list used for O(1) membership tests; rebuild it (re-run this
# cell) if no_list is changed.
_no_set = frozenset(no_list)

def delete_not_worthy_word(x):
    """Tokenize ``x`` and drop every token that appears in ``no_list``.

    Args:
        x: text to clean (a string).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # NOTE(review): word_tokenize is called with its default language — confirm
    # whether language='norwegian' should be passed for these texts.
    return ' '.join(word for word in nltk.word_tokenize(x) if word not in _no_set)

In [None]:
# Store the filtered, space-joined tokens of each text in a new column.
df['cleaned_text'] = df['VEDTAKSBEGRUNNELSE'].apply(delete_not_worthy_word)

In [None]:
from collections import Counter
# Count whitespace-separated tokens across all cleaned texts and list the 100 most frequent.
Counter(
    token
    for text in df["cleaned_text"]
    for token in text.split()
).most_common(100)

In [None]:
# Show the rows whose cleaned text contains an '=' character.
filt = (df['cleaned_text'].str.contains('='))
df.loc[filt]

In [None]:
# Show the rows whose recorded text length exceeds 500.
filt = df['Tekst_len'].gt(500)
df.loc[filt]

In [None]:
# NOTE(review): placeholder that is not used below; the connection is built from oracle_secrets.
connection_string = 'username/pass@localhost/schema'
clob_column_name = 'MELDING'
query = """ SELECT 
pk_ef_meta_data, kafka_offset, {} 
FROM dvh_fam_ef.fam_ef_meta_data 
where STONADSTYPE = 'SKOLEPENGER' """.format(clob_column_name)

dsn_tns = cx_Oracle.makedsn(oracle_secrets['host'], 1521, service_name = oracle_secrets['service'])
# NOTE(review): this connects as the plain user, without the '[DVH_FAM_EF]' suffix
# that get_data appends — confirm that is intended.
with cx_Oracle.connect(user = oracle_secrets['user'], password = oracle_secrets['password'], dsn = dsn_tns) as connection:
    # This replaces the `df` built by the earlier cells with the full MELDING data.
    df = pd.read_sql(sql=query,
                     con=connection)
    # The LOB objects are read inside the with-block, while the connection is open.
    # Values without a .read method (e.g. None for a NULL MELDING) are passed
    # through unchanged instead of raising AttributeError.
    df[clob_column_name] = df[clob_column_name].apply(
        lambda x: x.read() if hasattr(x, 'read') else x
    )