#### Import packages and read all JSON files

In [1]:
import os
import json
import pandas as pd

# Folder that holds the tagged JSON files to load.
directory = "Deception_tagged_json"
# Maps each JSON file name to its parsed content.
all_data = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the dict
# (and every table built from it) in the same order on every run.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".json"):
        file_path = os.path.join(directory, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Store after the file handle has been closed.
        all_data[filename] = data

#### All keys

In [2]:
# List the loaded file names (the keys of all_data).
all_data.keys()

dict_keys(['deception_row_10.json', 'deception_row_100.json', 'deception_row_101.json', 'deception_row_102.json', 'deception_row_103.json', 'deception_row_104.json', 'deception_row_105.json', 'deception_row_106.json', 'deception_row_107.json', 'deception_row_108.json', 'deception_row_109.json', 'deception_row_11.json', 'deception_row_110.json', 'deception_row_111.json', 'deception_row_112.json', 'deception_row_113.json', 'deception_row_114.json', 'deception_row_115.json', 'deception_row_116.json', 'deception_row_117.json', 'deception_row_118.json', 'deception_row_119.json', 'deception_row_12.json', 'deception_row_120.json', 'deception_row_121.json', 'deception_row_122.json', 'deception_row_123.json', 'deception_row_124.json', 'deception_row_125.json', 'deception_row_126.json', 'deception_row_127.json', 'deception_row_128.json', 'deception_row_129.json', 'deception_row_13.json', 'deception_row_130.json', 'deception_row_132.json', 'deception_row_133.json', 'deception_row_134.json', 'dece

#### Extract the entities from the `annotations` key of each file and build a DataFrame

In [3]:
# Flatten every tagged span into one row: [file id, full text, start, end, label].
# Local names avoid `id` and `type` so the Python builtins stay usable in
# later cells of the same kernel session.
entities = []
for key, value in all_data.items():
    # The part of the file name before its first "." identifies the file.
    # It depends only on the file, so compute it once per file.
    doc_id = key.split(".")[0]
    for annotation in value["annotations"]:
        # Each annotation is indexed as [text, {"entities": [...]}].
        text = annotation[0]
        for entity in annotation[1]["entities"]:
            # An entity is indexed as [start offset, end offset, label].
            start, end, label = entity[0], entity[1], entity[2]
            entities.append([doc_id, text, start, end, label])

df = pd.DataFrame(entities, columns=["Id", "Text", "Start", "End", "Type"])

In [4]:
# Display the entity table (one row per tagged span).
df

Unnamed: 0,Id,Text,Start,End,Type
0,deception_row_10,VTM1-2 are friends and they acquainted with WP...,11,18,METHOD
1,deception_row_10,VTM1-2 are friends and they acquainted with WP...,56,72,CHANNEL
2,deception_row_10,VTM1-2 are friends and they acquainted with WP...,138,256,METHOD
3,deception_row_10,VTM1-2 are friends and they acquainted with WP...,325,342,HOW_PROPERTY_IS_TAKEN_AWAY
4,deception_row_100,"""4. E-Shopping (C2C) Fraud\n Social Me...",141,152,CHANNEL
...,...,...,...,...,...
878,deception_row_99,"""4. E-Shopping (C2C) Fraud\n Social Me...",245,255,AMOUNT
879,deception_row_99,"""4. E-Shopping (C2C) Fraud\n Social Me...",285,288,HOW_PROPERTY_IS_TAKEN_AWAY
880,deception_row_99,"""4. E-Shopping (C2C) Fraud\n Social Me...",358,361,GENDER
881,deception_row_99,"""4. E-Shopping (C2C) Fraud\n Social Me...",365,375,CHANNEL


#### Value counts of the Type column

In [5]:
# Count how many tagged spans carry each entity label.
# NOTE(review): the recorded output lists both LOC and LOCATION — confirm
# whether these are meant to be two distinct labels.
df['Type'].value_counts()

METHOD                        243
CHANNEL                       202
HOW_PROPERTY_IS_TAKEN_AWAY    178
AMOUNT                        123
GENDER                         61
NUMBER_OF_OCCURRENCE           24
LOG_TYPE                       16
LOC                            16
LOCATION                        8
NATIVE                          6
VICTIM_CAREER                   4
OFFENCE                         2
Name: Type, dtype: int64

#### Extract the annotated span from the text using Start and End

In [6]:
# Slice each row's full text down to the tagged span text[Start:End].
# Zipping the columns works for any number of rows, including zero; a row-wise
# df.apply(..., axis=1) does not return a Series for an empty frame.
df['Annotated_text'] = [
    text[start:end]
    for text, start, end in zip(df['Text'], df['Start'], df['End'])
]
df.head()

Unnamed: 0,Id,Text,Start,End,Type,Annotated_text
0,deception_row_10,VTM1-2 are friends and they acquainted with WP...,11,18,METHOD,friends
1,deception_row_10,VTM1-2 are friends and they acquainted with WP...,56,72,CHANNEL,social gathering
2,deception_row_10,VTM1-2 are friends and they acquainted with WP...,138,256,METHOD,WP claimed to have connection with travel agen...
3,deception_row_10,VTM1-2 are friends and they acquainted with WP...,325,342,HOW_PROPERTY_IS_TAKEN_AWAY,WP’s FPS account.
4,deception_row_100,"""4. E-Shopping (C2C) Fraud\n Social Me...",141,152,CHANNEL,‘Carousell’


#### Example from Each Type

In [None]:
# One example per entity label, with the long Text column left out.
# Note: groupby(...).first() takes the first non-null value of each column
# within a group.
(
    df
    .drop(columns=["Text"])
    .groupby("Type")
    .first()
)

#### To Excel

In [9]:
# Write the entity table to an Excel workbook, leaving out the row index.
output_path = "Deception_tagged.xlsx"
df.to_excel(output_path, index=False)