In [20]:
import pandas as pd

# Load the dataset
file_path = 'dataset.csv'
df = pd.read_csv(file_path)
df.dropna(subset=['difficult_words'], inplace=True)

# Display the first few rows of the dataset to understand its structure
df.head()

Unnamed: 0,par_id,paragraph,has_entity,lexicon_count,difficult_words,last_editor_gender,category,text_clarity
0,428209002237,Ramsay was born in Glasgow on 2 October 1852. ...,ORG_YES_PRODUCT_NO_PERSON_YES_,49,12.0,man,biographies,clear_enough
1,564218010072,It has been widely estimated for at least the ...,ORG_YES_PRODUCT_NO_PERSON_NO_,166,47.0,man,artificial intelligence,not_clear_enough
2,291401001672,He went on to win the Royal Medal of the Royal...,ORG_YES_PRODUCT_NO_PERSON_NO_,69,18.0,non-binary,biographies,clear_enough
3,31548004883,The changes have altered many underlying assum...,ORG_NO_PRODUCT_YES_PERSON_NO_,76,27.0,non-binary,programming,clear_enough
4,50634005146,"After these novels were published, Disraeli de...",ORG_YES_PRODUCT_YES_PERSON_YES_,200,47.0,man,biographies,not_clear_enough


In [21]:
# Check the number of unlabeled data points in the "text_clarity" column
unlabeled_df = df[df['text_clarity'].isna()]

# Count of unlabeled data points
unlabeled_count = unlabeled_df.shape[0]
unlabeled_count

9320

In [22]:
def label_clarity(row):
    # Simplified criteria for demonstration
    # Avoid dividing by zero - gadgets don't like being put into existential crises
    if row['lexicon_count'] == 0:
        return 'df_says_what'  # Or any label that suits your needs for this scenario
    # If the paragraph contains more than 15% difficult words, it might be 'not_clear_enough'
    if row['difficult_words'] / row['lexicon_count'] > 0.15:
        return 'not_clear_enough'
    else:
        return 'clear_enough'

# Apply the labeling function to the first 100 unlabeled data points
unlabeled_df['text_clarity'] = unlabeled_df.apply(label_clarity, axis=1)

# Check the first few labeled rows to ensure the process is working as expected
unlabeled_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_df['text_clarity'] = unlabeled_df.apply(label_clarity, axis=1)


Unnamed: 0,par_id,paragraph,has_entity,lexicon_count,difficult_words,last_editor_gender,category,text_clarity
9,79225005859,Pepys' diary provides a first-hand account of ...,ORG_NO_PRODUCT_YES_PERSON_YES_,139,31.0,man,biographies,not_clear_enough
10,959908012329,"Looking ahead, the current progress in data-to...",ORG_NO_PRODUCT_NO_PERSON_NO_,70,22.0,woman,artificial intelligence,not_clear_enough
11,123020006718,"Real-time programs such as simulations, flight...",ORG_NO_PRODUCT_NO_PERSON_NO_,84,38.0,man,programming,not_clear_enough
12,113961006578,"Darwin was the inventor of several devices, th...",ORG_NO_PRODUCT_NO_PERSON_YES_,35,8.0,woman,biographies,not_clear_enough
13,8471004175,He was among the first British troops into bo...,ORG_YES_PRODUCT_NO_PERSON_YES_,214,46.0,man,biographies,not_clear_enough


In [23]:
# Update the labeling function to handle cases with zero lexicon count
def label_clarity_corrected(row):
    # If lexicon count is 0, avoid division by zero by defaulting to 'clear_enough'
    if row['lexicon_count'] == 0:
        return 'clear_enough'
    # Apply the original criterion with a safeguard against division by zero
    elif row['difficult_words'] / row['lexicon_count'] > 0.15:
        return 'not_clear_enough'
    else:
        return 'clear_enough'

# Reapply the labeling function with the corrected logic
unlabeled_df['text_clarity'] = unlabeled_df.apply(label_clarity_corrected, axis=1)

# Check the first few labeled rows to ensure the process is working as expected
unlabeled_df.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_df['text_clarity'] = unlabeled_df.apply(label_clarity_corrected, axis=1)


Unnamed: 0,par_id,paragraph,has_entity,lexicon_count,difficult_words,last_editor_gender,category,text_clarity
9,79225005859,Pepys' diary provides a first-hand account of ...,ORG_NO_PRODUCT_YES_PERSON_YES_,139,31.0,man,biographies,not_clear_enough
10,959908012329,"Looking ahead, the current progress in data-to...",ORG_NO_PRODUCT_NO_PERSON_NO_,70,22.0,woman,artificial intelligence,not_clear_enough
11,123020006718,"Real-time programs such as simulations, flight...",ORG_NO_PRODUCT_NO_PERSON_NO_,84,38.0,man,programming,not_clear_enough
12,113961006578,"Darwin was the inventor of several devices, th...",ORG_NO_PRODUCT_NO_PERSON_YES_,35,8.0,woman,biographies,not_clear_enough
13,8471004175,He was among the first British troops into bo...,ORG_YES_PRODUCT_NO_PERSON_YES_,214,46.0,man,biographies,not_clear_enough
...,...,...,...,...,...,...,...,...
105,220520007855,"Here, he appears to give his support to the a...",ORG_YES_PRODUCT_YES_PERSON_YES_,190,51.0,man,philosophy,not_clear_enough
106,83427000844,Several revisions of the language have appeare...,ORG_NO_PRODUCT_NO_PERSON_NO_,37,8.0,man,programming,not_clear_enough
107,430941009538,He was an invited speaker in the International...,ORG_YES_PRODUCT_NO_PERSON_YES_,83,26.0,man,biographies,not_clear_enough
108,58842005559,Level 2: Conventional Morality/Role Conformity...,ORG_NO_PRODUCT_NO_PERSON_YES_,16,7.0,man,philosophy,not_clear_enough
