In [40]:
import pandas as pd
import os

# Create an empty DataFrame with the specified columns
opp_115 = pd.DataFrame(columns=['segment_id', 'policy_id', 'text', 'label'])

# Define the directory path
directory_path = '/Users/shayan/Desktop/LLM/OPP_115_Dataset'

# Iterate over all files in the directory
filenames = sorted(os.listdir(directory_path), key=lambda x: int(x.split('_')[0]))

for filename in filenames:
    if filename.endswith('.csv'):
        file_path = os.path.join(directory_path, filename)
        # Read the CSV file
        df = pd.read_csv(file_path)

        # Add the column names
        df.columns = ['segment_id', 'text', 'label']
        df['policy_id'] = filename.split('_')[0]

        # Merge labels for rows with the same id and policy_id
        df = df.groupby(['segment_id', 'policy_id'], as_index=False).agg({'text': 'first', 'label': lambda x: ', '.join(x)})
        
        # Concatenate the data to the main DataFrame
        opp_115 = pd.concat([opp_115, df], ignore_index=True)

# Now opp_115 contains all rows from the CSV files in the specified directory
opp_115

Unnamed: 0,segment_id,policy_id,text,label
0,1,20,This privacy policy does not apply to Sites ma...,"Introductory/Generic, Practice not covered"
1,2,20,"By visiting our Sites, you are accepting the p...","Policy Change, Introductory/Generic"
2,3,20,What Information Is Collected? We collect two...,First Party Collection/Use
3,4,20,Personally Identifiable Information As a gene...,"Practice not covered, First Party Collection/Use"
4,5,20,Non-Personally Identifiable Information When ...,First Party Collection/Use
...,...,...,...,...
3720,84,1713,"Further, if you use any of the Communities fea...","Introductory/Generic, Data Security, Third Par..."
3721,85,1713,b. Questions. If you have questions about this...,Privacy contact information
3722,86,1713,c. Sole Statement: This document is the sole s...,Introductory/Generic
3723,87,1713,d. Other: Please review our Terms of Use Agree...,Introductory/Generic


In [None]:
# Define the mapping dictionary
label_mapping = {
    'First Party Collection/Use': 1,
    'Third Party Sharing/Collection': 2,
    'User Choice/Control': 3,
    'User Access, Edit, & Deletion': 4,
    'User Access': 4,
    'Edit and Deletion': 4,
    'Data Retention': 5,
    'Data Security': 6,
    'Policy Change': 7,
    'Do Not Track': 8,
    'International and Specific Audiences': 9,
    'Privacy contact information': 10,
    'Practice not covered': 11,
    'Introductory/Generic': 12
}

only_others = 0
not_only_others = 0

# Replace the label text with corresponding numbers
for index, row in opp_115.iterrows():
    try:
        labels = row['label'].split(',')

        if labels[0].strip().isdigit():
            continue

        if len(labels) == 1 and labels[0] == 'Other':
            only_others += 1
            continue
        elif 'Other' in labels:
            not_only_others += 1
            continue

        mapped_labels = [str(label_mapping[label.strip()]) for label in labels]
        opp_115.at[index, 'label'] = ', '.join(mapped_labels)
    except Exception as e:
        print(e)
        print(row)

print("\n--------------------------------------------------------------------")
print('Number of rows with only "Other" label:', only_others)
print('Number of rows with "Other" label along with other labels:', not_only_others)
opp_115

'Other'
segment_id                                                    5
policy_id                                                    21
text           Information You Give Us : We receive and stor...
label                         First Party Collection/Use, Other
Name: 39, dtype: object
'Other'
segment_id                                                    6
policy_id                                                    21
text           Automatic Information : We receive and store ...
label                         First Party Collection/Use, Other
Name: 40, dtype: object
'Other'
segment_id                                                    8
policy_id                                                    21
text           Mobile : When you download or use apps create...
label                         First Party Collection/Use, Other
Name: 42, dtype: object
'Other'
segment_id                                                    9
policy_id                                                    21


Unnamed: 0,segment_id,policy_id,text,label
0,1,20,This privacy policy does not apply to Sites ma...,"12, 11"
1,2,20,"By visiting our Sites, you are accepting the p...","7, 12"
2,3,20,What Information Is Collected? We collect two...,1
3,4,20,Personally Identifiable Information As a gene...,"11, 1"
4,5,20,Non-Personally Identifiable Information When ...,1
...,...,...,...,...
3720,84,1713,"Further, if you use any of the Communities fea...","Introductory/Generic, Data Security, Third Par..."
3721,85,1713,b. Questions. If you have questions about this...,10
3722,86,1713,c. Sole Statement: This document is the sole s...,12
3723,87,1713,d. Other: Please review our Terms of Use Agree...,12


In [42]:
# Filter out rows where the label contains "Other"
opp_115_filtered = opp_115[~opp_115['label'].str.contains('Other')]

# Reset the index of the DataFrame
opp_115_filtered.reset_index(drop=True, inplace=True)

opp_115_filtered

Unnamed: 0,segment_id,policy_id,text,label
0,1,20,This privacy policy does not apply to Sites ma...,"12, 11"
1,2,20,"By visiting our Sites, you are accepting the p...","7, 12"
2,3,20,What Information Is Collected? We collect two...,1
3,4,20,Personally Identifiable Information As a gene...,"11, 1"
4,5,20,Non-Personally Identifiable Information When ...,1
...,...,...,...,...
3141,83,1713,You are solely responsible for maintaining the...,"12, 6, 11"
3142,85,1713,b. Questions. If you have questions about this...,10
3143,86,1713,c. Sole Statement: This document is the sole s...,12
3144,87,1713,d. Other: Please review our Terms of Use Agree...,12


In [43]:
opp_115_filtered.to_csv('opp_115_dataset_2.csv', index=False)