In [23]:
import pandas as pd
from typing import Dict, List, Optional
from dataclasses import dataclass, field
import csv

In [30]:
@dataclass
class Example:
    """A single training example: one text and its numeric label."""
    text: str
    # load_train_data stores the label as a float, so annotate it as such.
    label: float


def load_train_data(path: str, sort: bool, encoding: str = 'utf-8') -> List[Example]:
    """Load examples from a pipe-separated file with a header row.

    Each data row must hold exactly three fields: ``text1|text2|label``.
    The two text fields are joined as ``text1 + '. ' + text2`` and the label
    is converted with ``float``.

    The file is parsed with the ``csv`` module rather than by splitting raw
    lines, so quoted fields (including fields that contain the delimiter or
    embedded newlines) are read back correctly.

    Args:
        path: Path of the file to read.
        sort: When true, examples are ordered by the length of their
            concatenated text (stable, shortest first); otherwise file order
            is kept.
        encoding: Text encoding used to open the file.

    Returns:
        The list of loaded examples.

    Raises:
        ValueError: If a non-empty row does not have exactly three fields, or
            if a label cannot be converted to ``float``.
    """
    examples: List[Example] = []
    # newline='' is what the csv module requires to handle line endings and
    # newlines inside quoted fields by itself.
    with open(path, newline='', encoding=encoding) as f:
        reader = csv.reader(f, delimiter='|')
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # Blank line: nothing to load.
                continue
            if len(row) != 3:
                raise ValueError(
                    "expected 3 '|'-separated fields near line %d of %s, got %d"
                    % (reader.line_num, path, len(row))
                )
            text1, text2, label = row
            examples.append(Example(text=text1 + '. ' + text2, label=float(label)))

    if sort:
        # list.sort is stable, so equally long texts keep their file order.
        examples.sort(key=lambda example: len(example.text))

    return examples

SyntaxError: invalid syntax (<ipython-input-30-3c6ca677768e>, line 20)

In [31]:
# Load the training examples in file order (sort=False disables length sorting).
examples = load_train_data('data/train_test/train.csv', sort=False)

In [32]:
# Inspect the first loaded example.
# NOTE(review): the saved output shows a string label, whereas load_train_data
# converts labels with float() — the output looks stale; re-run to confirm.
examples[0]

Example(text='sonde attain performa. le guide est resté coincé à lintérieur de la sonde, on ne peut plus le bouger. changement de sonde', label='2851.0')

In [22]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
import re
import string
from sklearn.preprocessing import LabelEncoder

# Constructed with analysis of embedding coverage.
# Maps a misspelled (or badly tokenised) form to its corrected form.
# Key order is significant: replace_typical_misspell joins the keys, in this
# order, into a single regex alternation, so earlier keys are tried first.
# (The duplicated "desaturation" entry has been removed; the mapping is unchanged.)
misspell_dict = {
    "cest": "c'est",
    "desaturation": "désaturation",
    "controlaterale": "controlatérale",
    "\x9cdème": "oedème",
    "\x9cdèmes": "oedèmes",
    "léchographie": "échographie",
    "\x9cil": "oeil",
    "limplant": "implant",
    "hemorragiques": "hémorragiques",
    "hypoglycemie": "hypoglycémie",
    "lautomate": "automate",
    "l\x9cil": "oeil",
    "adenopathie": "adénopathie",
    "prothetique": "prothétique",
    "inapproprie": "inapproprié",
    "lartère": "artère",
    "asthenie": "asthénie",
    "man\x9cuvre": "manoeuvre",
    "lexplantation": "explantation",
    "lymphoree": "lymphorée",
    "salpyngectomie": "salpingectomie",
    "burnaout": "burnout",
    "lnterventlon": "intervention",
    "pericardique": "péricardique",
    "lendométriose": "endométriose",
    "daudition": "audition",
    "désaltére": "désaltéré",
    "cephalee": "céphalée",
    "salpaginctomie": "salpingectomie",
    "menauposée": "ménopausée",
    "deczéma": "eczéma",
    "peritonite": "péritonite",
    "lablation": "ablation",
    "microjyste": "microkyste",
    "généralié": "généralité",
    "débriété": "ébriété",
    "acidocetose": "acidocétose",
    "dhéparine": "héparine",
    "dincident": "incident",
    "daiguille": "aiguille",
    "materiovigilance": "matériovigilance",
    "adenomyose": "adénomyose",
}


def replace_typical_misspell(text: str) -> str:
    """Replace every occurrence of a ``misspell_dict`` key by its correction.

    Keys are matched as literal substrings (no word-boundary check) and are
    tried in the dictionary's iteration order at each position.

    Args:
        text: The text to correct.

    Returns:
        ``text`` with each matched key replaced by ``misspell_dict[key]``.
        The text is returned unchanged when ``misspell_dict`` is empty.
    """
    if not misspell_dict:
        # An empty alternation would match the empty string everywhere and
        # the lookup below would then fail with KeyError.
        return text

    # Escape the keys so they are matched literally: an unescaped regex
    # metacharacter could match text that is not a dictionary key, which
    # would make the replacement lookup raise KeyError.
    misspell_re = re.compile('|'.join(re.escape(key) for key in misspell_dict))

    return misspell_re.sub(lambda match: misspell_dict[match.group(0)], text)


# Characters that clean_text deletes from the text.
# Note that the list also contains a few letters ('Â', 'â', 'Ã', 'ï', 'Ø'),
# not only punctuation and symbols; ',' and '.' are not in the list.
puncts = ['"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
          '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©',
          '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█',
          '½', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶',
          '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼',
          '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', '¯', '♦', '¤', '▲',
          '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪',
          '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']

# Characters that clean_text removes outright ('\x85' only).
bad_char = ['\x85']

# Whitespace-control sequences that clean_text turns into a space.
# The first two entries are written as '|'-separated alternatives (regex
# style): the first targets the escaped two-character forms (backslash + t/n/r),
# the second the actual tab / newline / carriage-return characters.
carriage = [r"\\t|\\n|\\r", "\t|\n|\r", "\r\r\n"]


def clean_text(text: str) -> str:
    """Normalise mis-encoded characters and strip control/punctuation characters.

    Steps, in order:
      1. ``'\\x9c'`` becomes ``'oe'`` and ``'\\x92'`` becomes an apostrophe.
      2. every match of a ``carriage`` pattern becomes a single space;
      3. every ``bad_char`` entry is removed;
      4. every ``puncts`` entry is removed.

    Args:
        text: The value to clean; it is converted with ``str`` first.

    Returns:
        The cleaned text.
    """
    text = str(text)
    text = text.replace('\x9c', 'oe')
    text = text.replace('\x92', "'")
    for carr in carriage:
        # The carriage entries are '|'-separated alternations, so they have to
        # be applied as regular expressions. Used as literal substrings (as
        # before) they never matched a lone tab, newline or carriage return.
        text = re.sub(carr, ' ', text)
    for bc in bad_char:
        text = text.replace(bc, '')
    for punct in puncts:
        text = text.replace(punct, '')

    return text


def clean_numbers(text: str) -> str:
    """Return ``text`` with every run of digits removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)


def preprocess_text(text: str) -> str:
    """Run the full cleaning pipeline on one cell value.

    For a string: lower-case it, apply ``clean_text``, ``clean_numbers`` and
    ``replace_typical_misspell`` (in that order), strip the ends and collapse
    runs of spaces into one. Any non-string value yields the empty string.
    """
    # Non-string values are mapped to an empty text.
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    corrected = replace_typical_misspell(clean_numbers(clean_text(lowered)))
    return re.sub(' +', ' ', corrected.strip())

# Create train and test


# Input columns kept from the raw declarations file, and the target column.
X_cols = ['NUMERO_DECLARATION','LIBELLE_COMMERCIAL','DESCRIPTION_INCIDENT', 'ETAT_PATIENT', 'FABRICANT', 'DISTRIBUTEUR']
y_col = ['GRAVITE']

df = pd.read_csv('data/declaration_mrv.csv', sep=";", encoding='latin1')


# Drop all rows whose target is missing.

df = df.dropna(subset=y_col)
print(df.shape)

# Fill NaN values in the text fields with empty strings.

df['DESCRIPTION_INCIDENT'] = df['DESCRIPTION_INCIDENT'].fillna("")
# NOTE(review): ETAT_PATIENT is filled from TYPE_EFFET, not from ETAT_PATIENT
# itself — kept as is; confirm this is intended and not a copy-paste slip.
df['ETAT_PATIENT'] = df['TYPE_EFFET'].fillna("")
df['LIBELLE_COMMERCIAL'] = df['LIBELLE_COMMERCIAL'].fillna("")
df['FABRICANT'] = df['FABRICANT'].fillna("")
df['DISTRIBUTEUR'] = df['DISTRIBUTEUR'].fillna("")


# df_subset used to be read without ever being defined in this notebook
# (hidden kernel state), so a fresh run failed with NameError.
# NOTE(review): selecting X_cols + y_col is an assumption about the original
# definition — confirm. The explicit copy keeps the assignments below from
# writing into a view of df.
df_subset = df[X_cols + y_col].copy()

df_subset['produit'] = df_subset['LIBELLE_COMMERCIAL']
df_subset['incident'] = df_subset['DESCRIPTION_INCIDENT']+' '+df_subset['ETAT_PATIENT']

# Duplicates are detected on the raw (not yet cleaned) incident text.
df_subset = df_subset.drop_duplicates(subset='incident')

# Label encoder; it is created but not fitted here (encoding is disabled).
le = LabelEncoder()

#df_subset[y_col] = le.fit_transform(df_subset[y_col].values)


print("max",df_subset[y_col].max())
print("min",df_subset[y_col].min())


# Group-aware split: rows sharing a NUMERO_DECLARATION never end up on both
# sides. Only the first of the generated splits is used.
train_index, test_index = next(GroupShuffleSplit(random_state=1029).split(df_subset, groups=df_subset['NUMERO_DECLARATION']))

# Copy the slices so the column assignments below modify independent frames
# instead of triggering SettingWithCopyWarning.
train = df_subset.iloc[train_index].copy()
test = df_subset.iloc[test_index].copy()

print(train['GRAVITE'].value_counts(normalize=False))
print(test['GRAVITE'].value_counts(normalize=True))

# Clean the text columns. Series.map is used column by column because
# DataFrame.applymap is deprecated in recent pandas releases.
text_columns = ['produit', 'incident']
for text_column in text_columns:
    train[text_column] = train[text_column].map(preprocess_text)
    test[text_column] = test[text_column].map(preprocess_text)

train = train[['produit', 'incident', 'GRAVITE']]
test = test[['produit', 'incident', 'GRAVITE']]

print(train.head())
train.to_csv('data/train_test/train_gravite.csv', index=False, sep='|')
test.to_csv('data/train_test/test_gravite.csv', index=False, sep='|')

(74019, 24)
max GRAVITE    SEVER
dtype: object
min GRAVITE    CRITI
dtype: object
SEVER    19978
MOYEN    10466
MINEU     4894
CRITI     1138
NULLE      340
Name: GRAVITE, dtype: int64
SEVER    0.549149
MOYEN    0.283299
MINEU    0.126585
CRITI    0.029587
NULLE    0.011380
Name: GRAVITE, dtype: float64
                                             produit  ... GRAVITE
0                              sonde attain performa  ...   MOYEN
1              tensoban bande de protection adhesive  ...   MOYEN
2                        sphincterotome dreamtome rx  ...   NULLE
5          pince kocher dans la trousse accouchement  ...   MOYEN
9  tubulure pack avec enfit pour pompe flocare in...  ...   MINEU

[5 rows x 3 columns]


In [12]:
# Show the (rows, columns) size of the training split.
print(train.shape)

(36816, 3)


In [23]:
# NOTE(review): scratch calculation; what 348 stands for is not established
# anywhere in this notebook — name the constant or remove the cell.
1. / 348

0.0028735632183908046