<a href="https://colab.research.google.com/github/DavinciB/child_grooming_detector/blob/main/ProjectDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only setup: mount Google Drive at /content/drive. The data and model
# paths used by the later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from sklearn import svm
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn import metrics
import heapq
import operator
import numpy as np
from mpl_toolkits.mplot3d import axes3d, Axes3D
from sklearn.svm import LinearSVC
import xml.etree.ElementTree as ET
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import pickle

In [None]:
def get_labels_dict(data_path):
    """Load the conversation-id -> label mapping from ``sci_labels.csv``.

    Args:
        data_path: Directory prefix that is concatenated directly with
            ``'sci_labels.csv'`` (so it must already end with a separator).

    Returns:
        dict mapping the first CSV column to the second column. Both are
        kept as the strings read from the file.

    Raises:
        IndexError: if a non-empty row has fewer than two columns.
    """
    labels_dict = {}
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for file objects passed to csv.reader.
    with open(data_path + 'sci_labels.csv', 'r', newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on row[0].
            if not row:
                continue
            labels_dict[row[0]] = row[1]
    return labels_dict

def get_features_labels(root, labels_dict):
    """Build one text document and one integer label per conversation.

    Args:
        root: Iterable of conversation elements. Every child message is
            looked up for a ``text`` sub-element.
        labels_dict: Mapping from a conversation's ``id`` attribute to a
            value that ``int()`` can convert.

    Returns:
        ``(corpus, labels)``: two parallel lists in iteration order. Each
        document is a single space followed by every non-empty message text,
        each one prefixed with a CR+LF pair.
    """
    corpus, labels = [], []
    for conversation in root:
        # Every document starts with the same one-space seed.
        pieces = [" "]
        for message in conversation:
            body = message.find('text').text
            if body is None:
                # Empty <text/> elements contribute nothing.
                continue
            pieces.append("\r\n" + body)
        corpus.append("".join(pieces))
        labels.append(int(labels_dict[conversation.get('id')]))
    return corpus, labels

def get_conversation_id(root):
    """Return the ``id`` attribute of each conversation under ``root``, in order."""
    return [conversation.get('id') for conversation in root]

# --- Suspicious-conversation step: load data, vectorize, predict ---

# Training conversations plus their labels file live in the same directory.
train_data_path = '/content/drive/MyDrive/online-grooming-detector-master/data/svm_training_data/'
training_xml = ET.parse(train_data_path + 'training_data.xml')
train_root = training_xml.getroot()

# Test labels come from test_data_path; the test conversations themselves are
# read from the corpus directory in test_data_src.
test_data_path = '/content/drive/MyDrive/online-grooming-detector-master/data/svm_test_data/'
test_data_src = '/content/drive/MyDrive/online-grooming-detector-master/data/pan12-sexual-predator-identification-test-corpus-2012-05-21/'
test_xml = ET.parse(test_data_src + 'pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
test_root = test_xml.getroot()

train_corpus, train_labels = get_features_labels(train_root, get_labels_dict(train_data_path))
test_corpus, test_labels = get_features_labels(test_root, get_labels_dict(test_data_path))
test_conversations = get_conversation_id(test_root)

# The vectorizer is fitted on the training corpus only; the test corpus is
# projected onto that vocabulary.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)
X_test = scipy.sparse.csr_matrix(X_test)
y_test = np.array(test_labels)

filename = '/content/drive/MyDrive/online-grooming-detector-master/models/GAI_SVM.sav'
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load model files from a trusted source.
# The context manager closes the file handle, which the bare open() leaked.
with open(filename, 'rb') as model_file:
    loaded_model_GAI = pickle.load(model_file)

pred_y = loaded_model_GAI.predict(X_test)
print(metrics.accuracy_score(y_test, pred_y))
occurrences = np.count_nonzero(pred_y == 1)
print(occurrences)

# Preview at most the first 250 predictions; the bound prevents an IndexError
# when the corpus holds fewer conversations than that.
for i in range(min(250, len(test_conversations))):
    print("{} {}".format(test_conversations[i], pred_y[i]))


In [None]:
# Collect the ids of conversations that were predicted as suspicious (label 1)
# and contain more than MIN_DEMO_MESSAGES messages.
MAX_SCANNED_PREDICTIONS = 10000  # only this many leading predictions are inspected
MIN_DEMO_MESSAGES = 100          # a conversation must have more messages than this

# Index the conversations by id once instead of rescanning test_root for every
# positive prediction. Lists keep document order and tolerate duplicate ids.
conversations_by_id = {}
for conversation in test_root:
    conversations_by_id.setdefault(conversation.get('id'), []).append(conversation)

demo_list_1 = []
# min() keeps the scan inside pred_y when there are fewer predictions than the cap.
for i in range(min(MAX_SCANNED_PREDICTIONS, len(pred_y))):
    if pred_y[i] != 1:
        continue
    for conversation in conversations_by_id.get(test_conversations[i], []):
        # len() of an Element is its number of direct children (the messages).
        if len(conversation) > MIN_DEMO_MESSAGES:
            print("{} {}".format(test_conversations[i], pred_y[i]))
            demo_list_1.append(test_conversations[i])

In [None]:
# Print the full message text of every conversation selected in demo_list_1
# that has more than MIN_PRINTED_MESSAGES messages, framed by separator lines.
MIN_PRINTED_MESSAGES = 25
separator = "*****************************************************************************************"

# Index the conversations by id once instead of rescanning test_root for every
# selected id. Lists keep document order and tolerate duplicate ids.
conversations_by_id = {}
for conversation in test_root:
    conversations_by_id.setdefault(conversation.get('id'), []).append(conversation)

for Id in demo_list_1:
    for conversation in conversations_by_id.get(Id, []):
        # len() of an Element is its number of direct children (the messages).
        if len(conversation) > MIN_PRINTED_MESSAGES:
            print(separator)
            for message in conversation:
                # An empty <text/> element has .text None and prints as "None".
                text = message.find('text').text
                print(text)
            print(separator)

In [None]:
# Reset names reused from the earlier cells; each gets its own fresh list.
# All three are assigned again further down in this cell.
test_corpus, test_labels, pred_y = [], [], []
def get_susp_conv_dict(data_path):
    """Load the conversation-id -> label mapping from ``sci_labels.csv``.

    Args:
        data_path: Directory prefix that is concatenated directly with
            ``'sci_labels.csv'`` (so it must already end with a separator).

    Returns:
        dict mapping the first CSV column to the second column, both as the
        strings read from the file.

    Raises:
        IndexError: if a non-empty row has fewer than two columns.
    """
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for file objects passed to csv.reader.
    with open(data_path + 'sci_labels.csv', 'r', newline='') as f:
        # `if row` drops the empty lists csv.reader yields for blank lines,
        # which would otherwise fail on row[0].
        return {row[0]: row[1] for row in csv.reader(f) if row}

def get_predators_dict(file):
    """Read a file of author ids and return them as a membership dict.

    Args:
        file: Path of a CSV-parsable text file; only the first column of each
            row is used.

    Returns:
        dict mapping every first-column value to ``1``.
    """
    all_predators = {}
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for file objects passed to csv.reader.
    with open(file, 'r', newline='') as f:
        for row in csv.reader(f):
            # Blank lines come back as [] and would fail on row[0]; skip them.
            if not row:
                continue
            all_predators[row[0]] = 1
    return all_predators
            
def get_features_labels(root, labels_dict, all_predators):
    """Build one document per (conversation, author) pair with a +1/-1 label.

    Conversations whose entry in ``labels_dict`` is the string ``'0'`` are
    skipped. Within each remaining conversation, an author's non-empty message
    texts are joined with single spaces into one document.

    Args:
        root: Iterable of conversation elements whose messages carry
            ``author`` and ``text`` sub-elements.
        labels_dict: Mapping from a conversation's ``id`` attribute to its
            label string.
        all_predators: Container tested with ``in`` for author membership.

    Returns:
        ``(corpus, labels, author_list)``: three parallel lists. A label is
        ``1`` when the author is in ``all_predators`` and ``-1`` otherwise.
    """
    corpus, labels, author_list = [], [], []
    for conversation in root:
        if labels_dict[conversation.get('id')] == '0':
            continue
        # Insertion-ordered, so authors come out in order of first message
        # with non-empty text.
        texts_by_author = {}
        for message in conversation:
            author = message.find('author').text
            text = message.find('text').text
            if text is None:
                continue
            texts_by_author.setdefault(author, []).append(text)
        for author, texts in texts_by_author.items():
            corpus.append(" ".join(texts))
            author_list.append(author)
            labels.append(1 if author in all_predators else -1)
    return corpus, labels, author_list



# --- Predator-identification step: per-author documents, vectorize, predict ---

train_data_path = '/content/drive/MyDrive/online-grooming-detector-master/data/svm_training_data/'
training_xml = ET.parse(train_data_path + 'training_data.xml')
train_root = training_xml.getroot()

test_data_path = '/content/drive/MyDrive/online-grooming-detector-master/data/svm_test_data/'
test_data_src = '/content/drive/MyDrive/online-grooming-detector-master/data/pan12-sexual-predator-identification-test-corpus-2012-05-21/'
test_xml = ET.parse(test_data_src + 'pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
test_root = test_xml.getroot()

# Files whose first column lists the predator author ids for each split.
pred_train_file_path = '/content/drive/MyDrive/online-grooming-detector-master/data/pan12-sexual-predator-identification-training-corpus-2012-05-01/pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt'
pred_test_file_path = '/content/drive/MyDrive/online-grooming-detector-master/data/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-groundtruth-problem1.txt'
train_corpus, train_labels, train_authors = get_features_labels(train_root, get_susp_conv_dict(train_data_path), get_predators_dict(pred_train_file_path))
test_corpus, test_labels, test_authors = get_features_labels(test_root, get_susp_conv_dict(test_data_path), get_predators_dict(pred_test_file_path))

# Unigram + bigram TF-IDF, fitted on the training documents only; the test
# documents are projected onto that vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)
X_test = scipy.sparse.csr_matrix(X_test)
y_test = np.array(test_labels)

filename = '/content/drive/MyDrive/online-grooming-detector-master/models/PI_SVM.sav'
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load model files from a trusted source.
# The context manager closes the file handle, which the bare open() leaked.
with open(filename, 'rb') as model_file:
    loaded_model_PI = pickle.load(model_file)

pred_y = loaded_model_PI.predict(X_test)
print(metrics.accuracy_score(y_test, pred_y))
occurrences = np.count_nonzero(pred_y == 1)
print(occurrences)
print("Suspicious authors are")
# Only the first quarter of the author list is printed, to keep the output short.
for i in range(len(test_authors)//4):
    print("{} {}".format(test_authors[i], pred_y[i]))