In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [23]:
# assign the column names

colnames=['LineID',
 'Perp_Victim',
 'Timestamp',
 'Message',
 'Line_Risk']

# there are three datasets that I have labeled
df1=pd.read_csv('/content/drive/MyDrive/Omdena STC/c189.csv', encoding='latin', names=colnames)
df2=pd.read_csv('/content/drive/MyDrive/Omdena STC/c263.csv', encoding='latin', names=colnames)
df3=pd.read_csv('/content/drive/MyDrive/Omdena STC/c3302.csv', encoding='latin', names=colnames)

In [24]:
df1.head()

Unnamed: 0,LineID,Perp_Victim,Timestamp,Message,Line_Risk
0,L355644,p,7:33:02 PM,hello,0.0
1,L355645,v,7:33:10 PM,hey,0.0
2,L355646,p,7:33:19 PM,hey,0.0
3,L355647,v,7:33:30 PM,wich 1,0.0
4,L355648,p,7:33:39 PM,well all actually,0.0


In [25]:
df2.head()

Unnamed: 0,LineID,Perp_Victim,Timestamp,Message,Line_Risk
0,L71748,p,21:20,hi. wanna chat?,1
1,L71749,p,9:21 p.m.,ur profile says ur looking for a bf. what age ...,1
2,L71750,v,9:22 p.m.,hiya,0
3,L71751,v,9:22 p.m.,any age,1
4,L71752,p,9:22 p.m.,i'm 21,1


In [26]:
df3.head()

Unnamed: 0,LineID,Perp_Victim,Timestamp,Message,Line_Risk
0,0,1,2,3,4
1,L314797,p,7/13/2006 0:46,hey,0
2,L314798,v,7/13/2006 0:47,hey,0
3,L314799,p,7/13/2006 0:47,hi,0
4,L314800,v,7/13/2006 0:47,hi wats up,0


In [27]:
df3=df3.drop(index=0) # drop the first row as it is not appropriate
df3.head()

Unnamed: 0,LineID,Perp_Victim,Timestamp,Message,Line_Risk
1,L314797,p,7/13/2006 0:46,hey,0
2,L314798,v,7/13/2006 0:47,hey,0
3,L314799,p,7/13/2006 0:47,hi,0
4,L314800,v,7/13/2006 0:47,hi wats up,0
5,L314801,p,7/13/2006 0:48,not too much,0


In [28]:
# drop the null values

df1.dropna(inplace=True)
df2.dropna(inplace=True)
df3.dropna(inplace=True)

In [29]:
df1['Line_Risk']=df1['Line_Risk'].astype('int64')  # convert to integer
df1.head()

Unnamed: 0,LineID,Perp_Victim,Timestamp,Message,Line_Risk
0,L355644,p,7:33:02 PM,hello,0
1,L355645,v,7:33:10 PM,hey,0
2,L355646,p,7:33:19 PM,hey,0
3,L355647,v,7:33:30 PM,wich 1,0
4,L355648,p,7:33:39 PM,well all actually,0


In [30]:
df1.shape, df2.shape, df3.shape

((543, 5), (211, 5), (1070, 5))

In [31]:
df=pd.concat([df1, df2, df3])  # concatenate the three datasets into one
df=df.sample(frac=1)
df.shape

(1824, 5)

In [32]:
df.head()

Unnamed: 0,LineID,Perp_Victim,Timestamp,Message,Line_Risk
94,L314890,p,7/13/2006 1:06,cool,0
151,L314947,v,7/13/2006 1:15,lol,0
436,L356080,v,10:43:35 PM,yea me 2,0
340,L315136,v,7/16/2006 9:56,were do u live for real?,0
356,L315152,v,7/19/2006 22:55,wats up?,0


In [33]:
common_words=['hi', 'me', 'mi', 'you', 'if', 'i', 'my', 'just', 'ur', 'is', 'was', 'cause', 'your', 'because', 'it', 'to', '2', 'n', 'so', 'you?', 'yo', 'hello', 'will', 'and', 'u', 
              'on', 'r', 'what', 'that', 'the', 'am', 'in', 'with', 'we', 'be', 'a', 'b', 'c', 'for', 'she', 'her', 'him', 'his', 'they', 'them', 'have', 'but', 'she', 'your', 'would']

# remove the common words which are not important

df['Message']=df['Message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (common_words)]))

In [34]:
df.head()

Unnamed: 0,LineID,Perp_Victim,Timestamp,Message,Line_Risk
94,L314890,p,7/13/2006 1:06,cool,0
151,L314947,v,7/13/2006 1:15,lol,0
436,L356080,v,10:43:35 PM,yea,0
340,L315136,v,7/16/2006 9:56,were do live real?,0
356,L315152,v,7/19/2006 22:55,wats up?,0


In [35]:
df['Line_Risk'].value_counts()

0    1199
1     563
2      62
Name: Line_Risk, dtype: int64

In [36]:
low_risk=df[df['Line_Risk'] == 1]
high_risk=df[df['Line_Risk'] == 2]

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest

In [38]:
def feature_identification(data, text_col, target_col, vectorizer, ngram, top_k_features):
    """ This function is used to generate the top k features for determining the context of low-risk and high-risk grooming conversations.It uses word vectorization to map words or 
    phrases with the N-gram model. For identification of the top contexts, it uses a mutual information classifier to select the best features.

    Args:
        data: pass the dataframe
        text_col: pass the text column from which the contexts to be extracted
        target_col: pass the target column which contains the corresponding labels
        vectorizer: set method for vectorization (i.e. Bag of Words (CountVectorizer), Term-Frequency Inverse-Document Frequency (TfidfVectorizer))
        ngram: set up continuous word, symbol, or token sequence in a document. (i.e. unigram-(1, 1), bigram-(2, 2), trigram-(3, 3), uni-bi-(1, 2), uni-tri-(1, 3), bi-tri-(2, 3))
        n_top_features: set desired number of features
    
    Returns:
        Desired features or contexts

    """
    v=vectorizer(ngram_range=ngram)
    X=v.fit_transform(data[text_col]).toarray()
    print(f"Number of generated features for the n_gram {ngram} is: {len(v.get_feature_names())}\n")
    df=pd.DataFrame(X, columns=v.get_feature_names_out())
    y=data[target_col]
    feature_top20=SelectKBest(mutual_info_classif, k=top_k_features)
    feature_top20.fit(X, y)
    return df.columns[feature_top20.get_support()]

In [39]:
# tri-gram analysis for the low risk conversations with Term-Frequency Inverse-Document Frequency

low_risk_features_uni_to_tri=feature_identification(data=low_risk, text_col='Message', target_col='Line_risk', vectorizer=TfidfVectorizer,  ngram=(3, 3), top_k_features=100)
low_risk_features_uni_to_tri

Number of generated features for the n_gram (3, 3) is: 981



Index(['wanna do more', 'wanna friday or', 'wanna hear cute', 'wanna meet ya',
       'wanna see it', 'wanna talk min', 'wanna talk phone', 'wanna use tha',
       'want call or', 'want call tomorrow', 'want can bring', 'want come see',
       'want hang out', 'want he offered', 'want mapquest ya',
       'want number now', 'want pick please', 'want pick up', 'want see today',
       'want some dude', 'want something swim', 'want talk please',
       'want tied up', 'want were anything', 'watch tv drink',
       'watched guy cam', 'way can alone', 'way hotel room', 'we ll lunch',
       'wear some pant', 'wearing hair pony', 'well it gonna', 'well rv most',
       'were anything else', 'were are going', 'were do usually',
       'were gonna take', 'wernt there 2day', 'wet horny from',
       'what most youve', 'what time friday', 'whatever hang out',
       'whatever want he', 'whatevere im tshirt', 'when are rents',
       'when get down', 'when he calls', 'when last relationship',
  

In [40]:
# tri-gram analysis for the high risk conversations Term-Frequency Inverse-Document Frequency

high_risk_features_uni_to_tri=feature_identification(data=high_risk, text_col='Message', target_col='Line_risk', vectorizer=TfidfVectorizer, ngram=(3, 3), top_k_features=100)
high_risk_features_uni_to_tri

Number of generated features for the n_gram (3, 3) is: 301



Index(['really it looks', 'really watch naked', 'reralationship ment last',
       'reralationship soon things', 'right away um', 'right had real',
       'santa cruz about', 'santa cruz wasnt', 'seeing strangers dicks',
       'self can watch', 'send link favorite', 'send more please',
       'set of tits', 'sex right away', 'sex you fucking', 'sexual chick from',
       'sexual reralationship soon', 'sexy like most', 'short period of',
       'shot all over', 'sitting here talking', 'skirt no panties',
       'so tells pretty', 'some bi sexual', 'soon things us',
       'sore after short', 'started having sex', 'stick head between',
       'straight into bed', 'strip love do', 'stripped naked got',
       'suck dick before', 'sum condoms rite', 'sure not going',
       'take off turn', 'talking about how', 'talking boxer briefs',
       'tell which one', 'tells pretty open', 'thanks hard work',
       'thatin xx rated', 'thats while diong', 'there one of',
       'things us werent', 