In [213]:
import ast
import pandas as pd
import jellyfish

import re

In [214]:
df = pd.read_csv('/Users/muhammadluay/Desktop/suweilah_test.csv')

In [215]:
# Function to remove unicode characters
def remove_unicode(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)

# Apply the function to the 'text' column where 'is_match' is False
df['text'] = df['text'].apply(remove_unicode)


In [216]:
# Function to process each row and concatenate BILOU entities
def process_entities(row):
    entities = ast.literal_eval(row)
    if not entities:
        return []

    grouped_entities = []
    current_entity = []

    for entity in entities:
        tag = entity['entity'][0]  # Get the first character of the entity tag (B, I, L, U, O)

        if tag in ['B', 'U']:
            if current_entity:  # If there is an ongoing entity, save it and start a new one
                grouped_entities.append(current_entity)
            current_entity = [entity]  # Start a new entity group
        elif tag in ['I', 'L']:
            current_entity.append(entity)

        if tag in ['L', 'U']:  # If the entity ends, save it
            grouped_entities.append(current_entity)
            current_entity = []

    # Process grouped entities to concatenate words and get start/end offsets
    result_entities = []
    for group in grouped_entities:
        text = ' '.join([e['word'] for e in group])  # Concatenation
        start_offset = group[0]['start']
        end_offset = group[-1]['end']
        result_entities.append({'text': text, 'start_offset': start_offset, 'end_offset': end_offset})

    return result_entities

# Apply the function to each row in the dataframe
df['processed_entities'] = df['entities_suweilah'].apply(process_entities)

# Function to post-process entities and merge adjacent tokens
def merge_entities(row):
    entities = row
    if not entities:
        return []

    merged_entities = []
    current_entity = {'text': '', 'start_offset': entities[0]['start_offset'], 'end_offset': entities[0]['end_offset']}

    for i, entity in enumerate(entities):
        if i > 0 and entity['start_offset'] == entities[i - 1]['end_offset']:
            current_entity['text'] += entity['text'].replace('##', '')
            current_entity['end_offset'] = entity['end_offset']
        else:
            if current_entity['text']:
                merged_entities.append(current_entity)
            current_entity = {'text': entity['text'].replace('##', ''), 'start_offset': entity['start_offset'], 'end_offset': entity['end_offset']}

    merged_entities.append(current_entity)  # Append the last entity

    return merged_entities


# Apply the post-processing function
df['merged_entities'] = df['processed_entities'].apply(merge_entities)

In [217]:
df['entities_suweilah']

0       [{'entity': 'B-HPOI', 'score': 0.68544704, 'in...
1                                                      []
2                                                      []
3       [{'entity': 'B-STAT', 'score': 0.99427706, 'in...
4                                                      []
                              ...                        
4061    [{'entity': 'B-CTRY', 'score': 0.99390113, 'in...
4062    [{'entity': 'U-CTRY', 'score': 0.91073656, 'in...
4063                                                   []
4064    [{'entity': 'U-CTRY', 'score': 0.9905347, 'ind...
4065    [{'entity': 'U-CTRY', 'score': 0.9896634, 'ind...
Name: entities_suweilah, Length: 4066, dtype: object

In [218]:
df['text']

0       The Hilton Myrtle Beach Resort reported this m...
1       (Hurricane Florence damage estimated at $17 bi...
2       If you have older family members in path of th...
3       Please donate if you can - #HurricaneFlorence ...
4       Hurricane Preparedness Tip: Charge your cordle...
                              ...                        
4061    RT @TheWatchers_: 202 killed, 94 still missing...
4062    RT @Newscloudlk: Government of Maldives donate...
4063                   Government Donations #FloodSL #lka
4064    We sustained stationary material to 440 kids i...
4065    Death toll rises to 119 while over 150 missing...
Name: text, Length: 4066, dtype: object

In [219]:
df.columns

Index(['tweet_id', 'text', 'entities_suweilah', 'processed_entities',
       'merged_entities'],
      dtype='object')

In [220]:
df['merged_entities'][4051]

[{'text': 'SriL', 'start_offset': 28, 'end_offset': 32},
 {'text': 'ka', 'start_offset': 34, 'end_offset': 36},
 {'text': 'SriL', 'start_offset': 52, 'end_offset': 56},
 {'text': 'ka', 'start_offset': 58, 'end_offset': 60}]

In [221]:
df['suweilah_labels'] = df['merged_entities'].apply(lambda x: [item['text'] for item in x])

In [222]:
df['suweilah_labels'].tail(50)

4016                                 [China]
4017                             [Sri Lanka]
4018                              [SriL, ka]
4019                             [Sri Lanka]
4020                             [Sri Lanka]
4021                              [SriL, ka]
4022                             [Sri Lanka]
4023                             [Sri Lanka]
4024                              [SriL, ka]
4025                            [Bangladesh]
4026                              [Cornwall]
4027                       [Akka, Sri Lanka]
4028                              [SriL, ka]
4029                            [Sri Lankan]
4030                                      []
4031               [Mall ika Na a School, G]
4032                                      []
4033                              [SriL, ka]
4034                                  [sril]
4035                              [SriL, ka]
4036                             [Sri Lanka]
4037                             [Sri Lanka]
4038      

In [223]:
malformed_df = pd.read_csv('/Users/muhammadluay/Desktop/Zindi/IDRISI-main/scripts/process/malformed_suweilah.csv')

In [224]:
malformed_df.head(50)

Unnamed: 0.1,Unnamed: 0,malformed_label,correct_label
0,0,Win,Winnabow
1,1,Carolina,Carolinas
2,2,R,RICHMOND
3,3,NorthCaro,NorthCarolina
4,4,Virginia,Virginias
5,5,Ho,Holshouser Building
6,6,N C,N.C.
7,7,Le,Leland
8,8,Le,Leland
9,9,Le,Leland


In [225]:
df['text']

0       The Hilton Myrtle Beach Resort reported this m...
1       (Hurricane Florence damage estimated at $17 bi...
2       If you have older family members in path of th...
3       Please donate if you can - #HurricaneFlorence ...
4       Hurricane Preparedness Tip: Charge your cordle...
                              ...                        
4061    RT @TheWatchers_: 202 killed, 94 still missing...
4062    RT @Newscloudlk: Government of Maldives donate...
4063                   Government Donations #FloodSL #lka
4064    We sustained stationary material to 440 kids i...
4065    Death toll rises to 119 while over 150 missing...
Name: text, Length: 4066, dtype: object

In [226]:
corrected_labels = {}

# Loop through each malformed label in the df['suweilah_labels'] series
for index, labels in df['suweilah_labels'].items():
    # print(index)
    text = df['text'][index]  # Get the corresponding text
    corrected = []
    for label in labels:
        candidates = []  # To store potential correct labels
        for _, row in malformed_df.iterrows():
            malformed_label = row['malformed_label']
            correct_label = row['correct_label']
            # Calculate the Jaro similarity
            similarity = jellyfish.jaro_similarity(label.lower(), malformed_label.lower())
            if similarity > 0.8:
                candidates.append(correct_label)
        
        # Check the original text for the most suitable correct label
        if candidates:
            max_similarity = 0
            selected_label = ""
            for candidate in candidates:
                if candidate.lower() in text.lower() and len(candidate) > len(selected_label):
                    selected_label = candidate
            
            corrected.append(selected_label if selected_label else label)
        else:
            corrected.append(label)
    
    corrected_labels[index] = corrected

# Add the corrected labels to the dataframe
df['corrected_labels'] = pd.Series(corrected_labels)

In [227]:
df[['suweilah_labels','corrected_labels']].tail(50)

Unnamed: 0,suweilah_labels,corrected_labels
4016,[China],[China]
4017,[Sri Lanka],[Sri lanka]
4018,"[SriL, ka]","[srilanka, ka]"
4019,[Sri Lanka],[Sri lanka]
4020,[Sri Lanka],[Sri lanka]
4021,"[SriL, ka]","[srilanka, ka]"
4022,[Sri Lanka],[Sri lanka]
4023,[Sri Lanka],[Sri lanka]
4024,"[SriL, ka]","[srilanka, ka]"
4025,[Bangladesh],[Bangladesh]


In [228]:
df['merged_entities'][4031]

[{'text': 'Mall ika Na a School', 'start_offset': 82, 'end_offset': 104},
 {'text': 'G', 'start_offset': 108, 'end_offset': 109}]

In [229]:
df['text'][0]

'The Hilton Myrtle Beach Resort reported this morning that the Grand Strand suffered only minor damage from #Florence and that the Hilton, with no damage, is back open, fully operational and ready to welcome us this weekend. #SLRCC'

In [230]:
# Function to process each row according to the provided conditions
def process_row(row):
    suweilah_labels = row['suweilah_labels']
    corrected_labels = row['corrected_labels']
    merged_entities = row['merged_entities']
    merged_entities2 = []

    # If the lengths of the suweilah_labels and corrected_labels lists are the same
    if len(suweilah_labels) == len(corrected_labels):
        for i in range(len(suweilah_labels)):
            entity = merged_entities[i]  # Get the corresponding entity
            
            # If the length of the individual labels is the same
            if len(suweilah_labels[i]) == len(corrected_labels[i]):
                merged_entities2.append(entity)
            else:
                # If the lengths are different, use the longer label and update the end_offset
                entity['text'] = corrected_labels[i]
                entity['end_offset'] = entity['start_offset'] + len(corrected_labels[i])
                merged_entities2.append(entity)

    return merged_entities2

# Apply the process_row function to each row in the DataFrame
df['merged_entities2'] = df.apply(process_row, axis=1)


In [231]:
def remove_overlapping_entities(entities):
    if not entities:
        return []
    
    non_overlapping_entities = []
    entities.sort(key=lambda x: x['end_offset'])  # Sort by end_offset to make the comparison easier
    
    previous_entity = None
    for entity in entities:
        if previous_entity:
            # If the end_offset is the same and the start_offset is larger, skip the entity
            if entity['end_offset'] == previous_entity['end_offset'] and entity['start_offset'] > previous_entity['start_offset']:
                continue
            
        non_overlapping_entities.append(entity)
        previous_entity = entity
    
    return non_overlapping_entities

# Applying the remove_overlapping_entities function
df['merged_entities2'] = df['merged_entities2'].apply(remove_overlapping_entities)


In [232]:
df['updated_labels2'] = df['merged_entities2'].apply(lambda x: [item['text'] for item in x])


In [233]:
# Function to check if a string has non-ASCII characters
def has_non_ascii(s):
    return not all(ord(c) < 128 for c in s)

# Function to replace '@' followed by any word with '@' followed by '^' of the same length as the word
def replace_at_word(text):
    return re.sub(r'@(\w+)', lambda x: '@' + '^' * len(x.group(1)), text)

df['text_cleaned'] = df['text'].apply(replace_at_word)

def refine_labels_and_entities(row):
    new_labels = []
    new_entities = []
    for label, entity in zip(row['updated_labels2'], row['merged_entities2']):
        # Extract the part of text_cleaned using start and end offsets
        text_part = row['text_cleaned'][entity['start_offset']:entity['end_offset']]
        
        # if not has_non_ascii(label) and '^' not in text_part and label != in ['S', 'the]':
        if not has_non_ascii(label) and '^' not in text_part and label not in ['S']:

            new_labels.append(label)
            new_entities.append(entity)
    row['updated_labels2'] = new_labels
    row['merged_entities2'] = new_entities
    return row

df = df.apply(refine_labels_and_entities, axis=1)

In [234]:
def check_is_match(row):
    is_match_list = []
    for entity in row['merged_entities2']:
        # Extract the part of text using start and end offsets
        text_part = row['text'][entity['start_offset']:entity['end_offset']]
        
        # Check if text_part matches the text value in merged_entities2 for that entity
        is_match_list.append(text_part == entity['text'])
    return is_match_list

df['is_match'] = df.apply(check_is_match, axis=1)

In [235]:
fp_df = pd.read_csv('/Users/muhammadluay/Library/Containers/com.microsoft.Excel/Data/Desktop/Zindi/IDRISI-main/scripts/EDA/fp_analysis.csv')

In [236]:
# Filter fp_df for rows where FPP <= 50
filtered_fp_df = fp_df[fp_df['False Positive Percentage'] <= 50]

def get_fp_pred_entity(text_cleaned):
    entities = []
    for label in filtered_fp_df['Label']:
        start_offset = text_cleaned.lower().find(label.lower())
        if start_offset != -1:
            end_offset = start_offset + len(label)
            entities.append({'text': text_cleaned[start_offset:end_offset], 'start_offset': start_offset, 'end_offset': end_offset})
    return entities

df['fp_pred_entity'] = df['text_cleaned'].apply(get_fp_pred_entity)

In [237]:
# Creating explicit ts and te columns in the DataFrame
for i in range(1, 18):  # Assuming up to 17 entities
    df[f'ts{i}'] = None
    df[f'te{i}'] = None

# Function to fill ts and te columns based on merged_entities2
def fill_ts_te(row):
    entities = row['merged_entities2']
    
    for i, entity in enumerate(entities, start=1):
        if i > 17:  # We're only considering up to 17 entities
            break
        
        # Assign start_offset and end_offset to ts and te columns respectively
        row[f'ts{i}'] = entity['start_offset']
        row[f'te{i}'] = entity['end_offset']
    
    return row

# Applying the fill_ts_te function to each row in the DataFrame
df = df.apply(fill_ts_te, axis=1)

# Print the DataFrame to check the updated ts and te columns
cols_to_display = ['merged_entities2'] + [f'ts{i}' for i in range(1, 18)] + [f'te{i}' for i in range(1, 18)]


In [238]:
df[['merged_entities2', 'ts1', 'te1', 'ts2', 'te2', 'ts3', 'te3', 'ts4', 'te4']]


Unnamed: 0,merged_entities2,ts1,te1,ts2,te2,ts3,te3,ts4,te4
0,"[{'text': 'Hilton Myrtle Beach Resort', 'start...",4.0,30.0,,,,,,
1,[],,,,,,,,
2,[],,,,,,,,
3,"[{'text': 'South Carolina', 'start_offset': 98...",98.0,112.0,,,,,,
4,[],,,,,,,,
...,...,...,...,...,...,...,...,...,...
4061,"[{'text': 'Sri Lanka', 'start_offset': 88, 'en...",88.0,97.0,,,,,,
4062,"[{'text': 'Maldives', 'start_offset': 31, 'end...",31.0,39.0,78.0,87.0,,,,
4063,[],,,,,,,,
4064,"[{'text': 'srilanka', 'start_offset': 59, 'end...",59.0,67.0,,,,,,


In [239]:
df = df.fillna(0)

In [240]:
df[cols_to_display].head(50)

Unnamed: 0,merged_entities2,ts1,ts2,ts3,ts4,ts5,ts6,ts7,ts8,ts9,...,te8,te9,te10,te11,te12,te13,te14,te15,te16,te17
0,"[{'text': 'Hilton Myrtle Beach Resort', 'start...",4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
2,[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
3,"[{'text': 'South Carolina', 'start_offset': 98...",98.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
4,[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
5,"[{'text': 'New Bern', 'start_offset': 46, 'end...",46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
6,"[{'text': 'Richmond', 'start_offset': 105, 'en...",105.0,115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
7,"[{'text': 'Craven County', 'start_offset': 63,...",63.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
8,[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
9,[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0


In [241]:
get_ids = '/Users/muhammadluay/Desktop/Zindi/IDRISI-main/scripts/all_test.jsonl'

In [242]:
ids_df = pd.read_json(get_ids, lines=True, dtype={'tweet_id': str})


In [243]:
ids_df

Unnamed: 0,tweet_id,user_id,text,created_at,humAID_class
0,1042006783957577729,2441057039,The Hilton Myrtle Beach Resort reported this m...,2018-09-18 11:05:51+00:00,infrastructure_and_utility_damage
1,1041941440157212674,1017086923507040256,(Hurricane Florence damage estimated at $17 bi...,2018-09-18 06:46:12+00:00,infrastructure_and_utility_damage
2,1039889676117659648,3021716824,If you have older family members in path of th...,2018-09-12 14:53:13+00:00,caution_and_advice
3,1041635465366003718,81013884,Please donate if you can - #HurricaneFlorence ...,2018-09-17 10:30:22+00:00,rescue_volunteering_or_donation_effort
4,1039518647528521729,705075336220815360,Hurricane Preparedness Tip: Charge your cordle...,2018-09-11 14:18:53+00:00,caution_and_advice
...,...,...,...,...,...
4061,870535125099884544,33860798,"RT @TheWatchers_: 202 killed, 94 still missing...",2017-06-02 06:58:42+00:00,injured_or_dead_people
4062,870533145472507904,868853805374349312,RT @Newscloudlk: Government of Maldives donate...,2017-06-02 06:50:50+00:00,rescue_volunteering_or_donation_effort
4063,871272107627425793,3164273462,Government Donations #FloodSL #lka,2017-06-04 07:47:12+00:00,rescue_volunteering_or_donation_effort
4064,872209745653923842,2922035136,We sustained stationary material to 440 kids i...,2017-06-06 21:53:02+00:00,rescue_volunteering_or_donation_effort


In [244]:
df['tweet_id'] = ids_df['tweet_id']

In [245]:
df

Unnamed: 0,tweet_id,text,entities_suweilah,processed_entities,merged_entities,suweilah_labels,corrected_labels,merged_entities2,updated_labels2,text_cleaned,...,ts13,te13,ts14,te14,ts15,te15,ts16,te16,ts17,te17
0,1042006783957577729,The Hilton Myrtle Beach Resort reported this m...,"[{'entity': 'B-HPOI', 'score': 0.68544704, 'in...","[{'text': 'Hilton Myrtle Beach Resort', 'start...","[{'text': 'Hilton Myrtle Beach Resort', 'start...",[Hilton Myrtle Beach Resort],[Hilton Myrtle Beach Resort],"[{'text': 'Hilton Myrtle Beach Resort', 'start...",[Hilton Myrtle Beach Resort],The Hilton Myrtle Beach Resort reported this m...,...,0,0,0,0,0,0,0,0,0,0
1,1041941440157212674,(Hurricane Florence damage estimated at $17 bi...,[],[],[],[],[],[],[],(Hurricane Florence damage estimated at $17 bi...,...,0,0,0,0,0,0,0,0,0,0
2,1039889676117659648,If you have older family members in path of th...,[],[],[],[],[],[],[],If you have older family members in path of th...,...,0,0,0,0,0,0,0,0,0,0
3,1041635465366003718,Please donate if you can - #HurricaneFlorence ...,"[{'entity': 'B-STAT', 'score': 0.99427706, 'in...","[{'text': 'South Carolina', 'start_offset': 98...","[{'text': 'South Carolina', 'start_offset': 98...",[South Carolina],[South Carolina],"[{'text': 'South Carolina', 'start_offset': 98...",[South Carolina],Please donate if you can - #HurricaneFlorence ...,...,0,0,0,0,0,0,0,0,0,0
4,1039518647528521729,Hurricane Preparedness Tip: Charge your cordle...,[],[],[],[],[],[],[],Hurricane Preparedness Tip: Charge your cordle...,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4061,870535125099884544,"RT @TheWatchers_: 202 killed, 94 still missing...","[{'entity': 'B-CTRY', 'score': 0.99390113, 'in...","[{'text': 'Sri Lanka', 'start_offset': 88, 'en...","[{'text': 'Sri Lanka', 'start_offset': 88, 'en...",[Sri Lanka],[Sri lanka],"[{'text': 'Sri Lanka', 'start_offset': 88, 'en...",[Sri Lanka],"RT @^^^^^^^^^^^^: 202 killed, 94 still missing...",...,0,0,0,0,0,0,0,0,0,0
4062,870533145472507904,RT @Newscloudlk: Government of Maldives donate...,"[{'entity': 'U-CTRY', 'score': 0.91073656, 'in...","[{'text': 'Mal', 'start_offset': 31, 'end_offs...","[{'text': 'Maldives', 'start_offset': 31, 'end...","[Maldives, Sri Lanka]","[Maldives, Sri lanka]","[{'text': 'Maldives', 'start_offset': 31, 'end...","[Maldives, Sri Lanka]",RT @^^^^^^^^^^^: Government of Maldives donate...,...,0,0,0,0,0,0,0,0,0,0
4063,871272107627425793,Government Donations #FloodSL #lka,[],[],[],[],[],[],[],Government Donations #FloodSL #lka,...,0,0,0,0,0,0,0,0,0,0
4064,872209745653923842,We sustained stationary material to 440 kids i...,"[{'entity': 'U-CTRY', 'score': 0.9905347, 'ind...","[{'text': 'Sri', 'start_offset': 59, 'end_offs...","[{'text': 'srilanka', 'start_offset': 59, 'end...","[SriL, ka]","[srilanka, ka]","[{'text': 'srilanka', 'start_offset': 59, 'end...",[srilanka],We sustained stationary material to 440 kids i...,...,0,0,0,0,0,0,0,0,0,0


In [246]:
df[['tweet_id'] + cols_to_display]

Unnamed: 0,tweet_id,merged_entities2,ts1,ts2,ts3,ts4,ts5,ts6,ts7,ts8,...,te8,te9,te10,te11,te12,te13,te14,te15,te16,te17
0,1042006783957577729,"[{'text': 'Hilton Myrtle Beach Resort', 'start...",4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,1041941440157212674,[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
2,1039889676117659648,[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
3,1041635465366003718,"[{'text': 'South Carolina', 'start_offset': 98...",98.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
4,1039518647528521729,[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4061,870535125099884544,"[{'text': 'Sri Lanka', 'start_offset': 88, 'en...",88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
4062,870533145472507904,"[{'text': 'Maldives', 'start_offset': 31, 'end...",31.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
4063,871272107627425793,[],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
4064,872209745653923842,"[{'text': 'srilanka', 'start_offset': 59, 'end...",59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0


In [247]:
# Holds the transformed data
transformed_data = []

# Iterating over each row in the DataFrame
for index, row in df.iterrows():
    tweet_id = row['tweet_id']

    for loc in range(1, 18):  # For each location from 1 to 17
        # Appending ts data
        start_id = f"ID_{tweet_id}_loc{loc}_start"
        start_value = row[f'ts{loc}']
        transformed_data.append([start_id, start_value])

        # Appending te data
        end_id = f"ID_{tweet_id}_loc{loc}_end"
        end_value = row[f'te{loc}']
        transformed_data.append([end_id, end_value])

# Creating a new DataFrame from the transformed data
transformed_df = pd.DataFrame(transformed_data, columns=['Tweet_ID', 'Value'])


In [248]:
transformed_df

Unnamed: 0,Tweet_ID,Value
0,ID_1042006783957577729_loc1_start,4.0
1,ID_1042006783957577729_loc1_end,30.0
2,ID_1042006783957577729_loc2_start,0.0
3,ID_1042006783957577729_loc2_end,0.0
4,ID_1042006783957577729_loc3_start,0.0
...,...,...
138239,ID_871208155463680000_loc15_end,0.0
138240,ID_871208155463680000_loc16_start,0.0
138241,ID_871208155463680000_loc16_end,0.0
138242,ID_871208155463680000_loc17_start,0.0


In [250]:
transformed_df.to_csv('submit_suweilah17.csv', index=False)