In [105]:
#from ydata_profiling import ProfileReport
import pandas as pd
#from cleaner import data_cleaning
import requests
import json

In [106]:
# Load the training set; the file is pipe-delimited (sep='|').
df = pd.read_csv('../data/teknofest_train_final.csv', sep='|')

In [107]:
# Send the whole text column to the remote bulk-preprocessing endpoint in one request
# (turkish_char=false is passed as a query parameter) and collect the returned strings.
preprocess_url = "https://cryptic-oasis-68424.herokuapp.com/bulk_preprocess?turkish_char=false"
texts = df["text"].tolist()
# Without a timeout the cell can hang forever if the service never answers; the limit is
# generous because every text is sent in a single request.
preprocess_response = requests.post(preprocess_url, json={"texts": texts}, timeout=300)
# Fail loudly on an HTTP error instead of hitting a confusing JSON/KeyError on the next line.
preprocess_response.raise_for_status()
processed_text = preprocess_response.json()['result']

In [108]:
# Overwrite the raw text column with the preprocessed strings returned by the service.
df['text'] = processed_text

In [109]:
# Preview the first rows to check the preprocessed text.
df.head()

Unnamed: 0,id,text,is_offensive,target
0,81c11060-a240-4d54-841b-9e2916039e85,curuk disli,1,INSULT
1,be80ebbf-b322-4c3b-afa1-94932ea80731,bu adamin islama ve muslumanlara verdigi zarar...,1,RACIST
2,f99e2513-83ed-4076-ac72-b9e2cff3f049,erkekler zora gelmez,1,SEXIST
3,83ed2b2e-b815-4f36-9fc4-80a9050cf2d0,utanmazin gotune kazik sokmuslar bu tikirti ne...,1,PROFANITY
4,d93e05f7-bfdd-4cdb-99d8-3048761b30ff,otomasyon sistemlerine dogrudan baglanabilir,0,OTHER


In [110]:
# Rows whose text repeats an earlier row. count() tallies non-null values per column,
# so 'text' can come out lower than the other columns when the repeated value is missing.
print(df[df.duplicated(subset='text')].count())

id              189
text             40
is_offensive    189
target          189
dtype: int64


In [111]:
# Keep only the first row for each distinct text (mutates df in place), then re-run
# the duplicate count to confirm that none remain.
df.drop_duplicates(subset='text', inplace=True)
print(df[df.duplicated(subset='text')].count())

id              0
text            0
is_offensive    0
target          0
dtype: int64


In [112]:
# Row counts per (target, is_offensive) pair, to spot label combinations that disagree.
df.groupby(['target', 'is_offensive']).size().reset_index(name='count')

Unnamed: 0,target,is_offensive,count
0,INSULT,1,2393
1,OTHER,0,3511
2,OTHER,1,56
3,PROFANITY,1,2372
4,RACIST,0,1
5,RACIST,1,2016
6,SEXIST,1,2079


In [113]:
def remove_short_text(df: pd.DataFrame, min_len: int = 5, text_col: str = "text") -> pd.DataFrame:
    """
    Drop, in place, every row whose text is shorter than a minimum length.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame; it is modified in place.
    min_len : int, optional (default=5)
        Rows whose text has fewer than ``min_len`` characters are removed.
    text_col : str, optional (default="text")
        Name of the column holding the text values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the short rows have been dropped.

    Raises
    ------
    KeyError
        If ``text_col`` is not a column of ``df``.

    Notes
    -----
    Non-string values are measured through ``str(value)``. Missing values
    (``None``/``NaN``) count as length 0 rather than being measured as the
    strings ``"None"``/``"nan"``, so they are removed for any positive
    ``min_len``.
    Rows are located by position and then dropped by index label, so the
    index labels are assumed to be unique.
    """
    column = df[text_col]
    # Collect the positions of the rows to remove; missing text counts as empty.
    short_positions = [
        position
        for position, (value, is_missing) in enumerate(zip(column, column.isna()))
        if (0 if is_missing else len(str(value))) < min_len
    ]
    # Translate positions to index labels and drop them from the caller's frame.
    df.drop(index=df.index[short_positions], inplace=True)
    return df


In [114]:
# Drops the short-text rows from df itself (default min_len=5); the call returns that
# same frame, which is what the cell displays.
remove_short_text(df)

Unnamed: 0,id,text,is_offensive,target
0,81c11060-a240-4d54-841b-9e2916039e85,curuk disli,1,INSULT
1,be80ebbf-b322-4c3b-afa1-94932ea80731,bu adamin islama ve muslumanlara verdigi zarar...,1,RACIST
2,f99e2513-83ed-4076-ac72-b9e2cff3f049,erkekler zora gelmez,1,SEXIST
3,83ed2b2e-b815-4f36-9fc4-80a9050cf2d0,utanmazin gotune kazik sokmuslar bu tikirti ne...,1,PROFANITY
4,d93e05f7-bfdd-4cdb-99d8-3048761b30ff,otomasyon sistemlerine dogrudan baglanabilir,0,OTHER
...,...,...,...,...
12612,71eedfa1-8fa6-425c-b982-258c3b29c003,uyuma taklidi yapan tehlikeli bir hayvanin goz...,0,OTHER
12613,b38eed16-6501-4563-8b33-ff2e634bb8e5,yolda at kavga eden uc oglan cocugu gorur,0,OTHER
12614,c8a051a8-94ef-4b64-a48e-54d0fa4f8323,sizin kopeklerinizin burnu bile daha iyi koku ...,0,OTHER
12615,513a7e6d-4207-4a16-9b47-972f26e23cfe,hayalleri gercek etmek icin birisinin delilik ...,0,OTHER


In [115]:
# Row counts per (target, is_offensive) pair after the short texts were dropped.
df.groupby(['target', 'is_offensive']).size().reset_index(name='count')

Unnamed: 0,target,is_offensive,count
0,INSULT,1,2393
1,OTHER,0,3511
2,OTHER,1,56
3,PROFANITY,1,2372
4,RACIST,1,2016
5,SEXIST,1,2079


In [116]:
def replace_is_offensive(df: pd.DataFrame) -> pd.DataFrame:
    """
    Set 'is_offensive' to 0, in place, for rows labelled 'OTHER' but flagged as offensive.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame that contains the 'target' and 'is_offensive' columns;
        it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the inconsistent 'is_offensive' values replaced

    Raises
    ------
    KeyError
        If 'target' or 'is_offensive' is not a column of ``df``.

    Notes
    -----
    A row is rewritten only when its 'target' is 'OTHER' and its 'is_offensive' is 1.
    The rows are selected with a positional boolean mask rather than by index label,
    so rows that merely share an index label with a matching row are left untouched.
    """
    inconsistent = ((df["target"] == "OTHER") & (df["is_offensive"] == 1)).to_numpy()
    df.loc[inconsistent, "is_offensive"] = 0
    return df

# Applied immediately: df is modified in place and the returned frame is displayed.
replace_is_offensive(df)

Unnamed: 0,id,text,is_offensive,target
0,81c11060-a240-4d54-841b-9e2916039e85,curuk disli,1,INSULT
1,be80ebbf-b322-4c3b-afa1-94932ea80731,bu adamin islama ve muslumanlara verdigi zarar...,1,RACIST
2,f99e2513-83ed-4076-ac72-b9e2cff3f049,erkekler zora gelmez,1,SEXIST
3,83ed2b2e-b815-4f36-9fc4-80a9050cf2d0,utanmazin gotune kazik sokmuslar bu tikirti ne...,1,PROFANITY
4,d93e05f7-bfdd-4cdb-99d8-3048761b30ff,otomasyon sistemlerine dogrudan baglanabilir,0,OTHER
...,...,...,...,...
12612,71eedfa1-8fa6-425c-b982-258c3b29c003,uyuma taklidi yapan tehlikeli bir hayvanin goz...,0,OTHER
12613,b38eed16-6501-4563-8b33-ff2e634bb8e5,yolda at kavga eden uc oglan cocugu gorur,0,OTHER
12614,c8a051a8-94ef-4b64-a48e-54d0fa4f8323,sizin kopeklerinizin burnu bile daha iyi koku ...,0,OTHER
12615,513a7e6d-4207-4a16-9b47-972f26e23cfe,hayalleri gercek etmek icin birisinin delilik ...,0,OTHER


In [117]:
# Row counts per (target, is_offensive) pair after relabelling; OTHER should now only
# appear with is_offensive == 0.
df.groupby(['target', 'is_offensive']).size().reset_index(name='count')

Unnamed: 0,target,is_offensive,count
0,INSULT,1,2393
1,OTHER,0,3567
2,PROFANITY,1,2372
3,RACIST,1,2016
4,SEXIST,1,2079


In [118]:
# Persist the cleaned frame; index=False leaves the row index out of the CSV.
df.to_csv('../data/obs_clean_data.csv', index=False)

In [101]:
"""profile = ProfileReport(df, title="Pandas Profiling Report")
profile"""

'profile = ProfileReport(df, title="Pandas Profiling Report")\nprofile'

In [102]:
#profile.to_file("data/data_report.html")

In [103]:
# Label glossary (Turkish term -> label): Cinsiyetçilik -> SEXIST, Irkçılık -> RACIST, Kızdırma -> INSULT & küfür -> PROFANITY, Nötr -> OTHER

In [17]:
# Reduce the 2022 dataset to its text and label columns, exposing the label as 'target'.
df2022 = pd.read_csv('../data/veriseti_toplu - veriseti_toplu.csv')
df_new = df2022[['text', 'label']].rename(columns={'label': 'target'})
# index=False keeps the row index out of the file, so reading it back does not
# produce a spurious 'Unnamed: 0' column.
df_new.to_csv('../data/veriseti_toplu.csv', index=False)

In [18]:
# NOTE: this rebinds `df` to the 2022 dataset file; when the notebook is run top to bottom,
# the cleaned training frame from the earlier cells is no longer reachable under this name.
df = pd.read_csv('../data/veriseti_toplu.csv')


In [20]:
# Keep only the rows labelled 'Kızdırma' and save them to their own file.
df = df[df['target'] == 'Kızdırma']
# index=False: do not write the row index as an extra unnamed column.
df.to_csv('../data/kızdırma.csv', index=False)