In [11]:
import pandas as pd
from src.swinno_helpers import split_cols
from src.swinno_helpers import melt_table


In [12]:
tags_221222 = pd.read_csv("../data/raw-data/221222-tagged-innovations.csv")
tags_221222.columns = tags_221222.columns.str.strip().str.lower().str.replace(" ", "_")
tags_221222.head()


Unnamed: 0,sinno_id,use_sectors,innovation_name_in_swedish,description_in_swedish,additional_info,innovation_type,bioeconomy_vision,notes
0,6000001.0,20400.0,Ensocoat/Ansogloss/Tambrite,Ny typ av kartongpapper i tre varianter avsedd...,,0,0,check article
1,6039001.0,20.0,WEN,Pneumatisk graverpenna. Drivenheten har inga a...,,90,90,check article
2,6068001.0,17.0,TEX VPS,Världens första maskin för vakuumförpackning a...,,0,9,not forest
3,6100001.0,13.0,,Skrotdetektor baserad på induktion av virvelst...,,9,9,not forest
4,6369001.0,13.0,,System för automatisk styrning av förarlösa fo...,,601501,9,not forest


In [17]:
def clean_codes(input_df, code_digits, col):
    old = "(\d{code_digits})(?!\s+$)"
    new = "$1,"
    output_df = input_df[col].str.replace(old, new)
    return output_df

## Check for Duplicates

In [20]:
def check_duplicates(df, subset='sinno_id'):
    duplicates = df.loc[df.duplicated(subset=subset, keep=False),:]
    duplicates = duplicates.sort_values(by=subset)

    if duplicates.empty:
        print('No duplicates found.')
    else:
        print(f'{len(duplicates)} duplicates found.')
        # df.to_csv(Path(ROOT, "data", "modified-data", f'{df}-duplicates.csv'))
        print(f'{df} written to csv.')

In [21]:
check_duplicates(tags_221222)

46 duplicates found.
       sinno_id  use_sectors   innovation_name_in_swedish  \
0     6000001.0      20400.0  Ensocoat/Ansogloss/Tambrite   
1     6039001.0         20.0                          WEN   
2     6068001.0         17.0                      TEX VPS   
3     6100001.0         13.0                          NaN   
4     6369001.0         13.0                          NaN   
..          ...          ...                          ...   
310  13013001.0        201.0                   Cambio 600   
311  13014001.0      20101.0                 Eurosaw S6 D   
312  13016001.0      20101.0                  Catech 6000   
313  13017001.0      20510.0                          NaN   
314         NaN          NaN                          NaN   

                                description_in_swedish additional_info  \
0    Ny typ av kartongpapper i tre varianter avsedd...             NaN   
1    Pneumatisk graverpenna. Drivenheten har inga a...             NaN   
2    Världens första mas

In [13]:
duplicates = tags_221222.loc[tags_221222.duplicated(subset="sinno_id", keep=False), :]
duplicates = duplicates.sort_values(by="sinno_id")
len(duplicates)



46

There are 46 duplicates in this dataset.
The majority is non forest.

While some innovations have received at least one differing code, the overwhelming majority has been tagged as close enough.

There is also a possibility that the introduction of some codes later in the process influenced the categorization.
Need to check this up

## Prepare DB Entry

In [14]:
innovation_types = tags_221222[["sinno_id", "innovation_type"]]
visions = tags_221222[["sinno_id", "bioeconomy_vision"]]
notes = tags_221222[["sinno_id", "notes"]]


### Innovation Types

In [None]:
split_innovation_types = split_cols(innovation_types, "innovation_type", sep=",")


In [None]:
melted_innovation_types = melt_table(
    split_innovation_types, "sinno_id", "innovation", "innovation_type"
).dropna()


In [None]:
melted_innovation_types.head()


In [None]:
melted_innovation_types["sinno_id"] = melted_innovation_types["sinno_id"].astype(int)


### Bioeconomy Visions

In [None]:
visions.columns


In [None]:
split_visions = split_cols(visions, col_to_split="bioeconomy_vision", sep=",")
melted_visions = melt_table(
    split_visions, id_vars="sinno_id", col_start="bio", value_name="bioeconomy_vision"
)
melted_visions.dropna(inplace=True)
melted_visions["sinno_id"] = melted_visions["sinno_id"].astype(int)


## Export


In [None]:
notes.to_csv("../data/modified-data/22122-notes.csv")
melted_visions.to_csv("../data/modified-data/221222-visions.csv")
melted_innovation_types.to_csv("../data/modified-data/221222-innovation-types.csv")


In [None]:
from pathlib import Path
from sqlalchemy import create_engine
database_dir = Path.cwd().parent.parent
database_uri = f'sqlite:///{database_dir}/swinno.db'


In [None]:
engine = create_engine(database_uri)

In [None]:
from src.swinno_helpers import connect_swinno_db

eninge = connect_swinno_db()

In [None]:
melted_innovation_types.to_sql(name='eco_innovations', con=engine, if_exists='append', index=False)


NameError: name 'melted_innovation_types' is not defined

In [None]:
notes.to_sql(name='classification_notes', con=engine, if_exists='append', index=False)

In [None]:
melted_visions.to_sql(name='bioeconomy_vision', con=engine, if_exists='append', index=False)