# Tags: Processing user-generated tags 

In [1]:
# For data manipulation and analysis
import pandas as pd
import numpy as np

# For text preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import datetime
import string

# For multilabel classification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB


# For model evaluation
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix


ImportError: Unable to import required dependencies:
numpy: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

Only re-run the code below (uncomment) if trying to get a new subset of data in - since subset is taken from here

In [7]:
tags = pd.read_csv("../dataset/ml-20m/tags.csv")

# run for the full subset for pos tagging for CB model
tags = pd.read_csv("../dataset/tags_full.csv")


### List of commonly used movie/tv shorthand notations
Including: notations, country codes (only including countries where top movies are created), ratings
Don't remove these

In [8]:
keep = []

# Country Codes for prominent film industries (ISO 3166-1 alpha-2 and alpha-3)
country_codes = [
    "US", "USA",  # United States
    "IN", "IND",  # India
    "GB", "GBR",  # United Kingdom
    "FR", "FRA",  # France
    "DE", "DEU",  # Germany
    "CN", "CHN",  # China
    "IT", "ITA",  # Italy
    "JP", "JPN",  # Japan
    "KR", "KOR",  # South Korea
    "RU", "RUS",  # Russia
    "AU", "AUS",  # Australia
    "CA", "CAN",  # Canada
    "ES", "ESP",  # Spain
    "BR", "BRA",  # Brazil
    "MX", "MEX"   # Mexico
]

keep = [
    "BBC", "CNN", "HBO", "FX", "MTV", "ESPN", "AMC", "TNT", "TBS", "VH1",
    "HD", "SD", "4K", "HDR", "UHD", "IMAX", "DV",
    "DD", "DTS", "THX",
    "OTT", "VOD", "DVR", "PPV", "FTA"
]

keep = keep + country_codes + [
    "G", "PG", "PG-13", "R", "NC-17", "U", "UA", "A", "S",
    "MA", "TV-Y", "TV-Y7", "TV-G", "TV-PG", "TV-14", "TV-MA"
]


**Subset Data (30% of users )** - Save in subset_file.csv (this is the subset that is tested on)
- Static subset 
- Run the below code once, then comment out. Otherwise, this changes the subset. 

In [9]:
# user_frac = 0.3
# # Get a random sample of unique userIds
# tags = tags_full
# unique_user_ids = tags['userId'].unique()
# subset_user_ids = np.random.choice(unique_user_ids, size=int(len(unique_user_ids) * user_frac), replace=False)
# tags = tags[tags['userId'].isin(subset_user_ids)]
# tags.to_csv('../dataset/subset.csv',index=False) # one once then never run again unless testing/increasing users

# reading in the subset 
tags = pd.read_csv('../dataset/subset.csv')

# data conversions
dt_dict = {'userId' : 'int', 'movieId' : 'int', 'tag' : 'str'} 
tags = tags.astype(dt_dict)


len(tags)


109313

In [10]:
# To find the number of distinct userIds
distinct_userIds = tags['userId'].nunique()
print(f"The number of distinct userIds is {distinct_userIds}")

The number of distinct userIds is 2225


In [11]:

# Only take single word tags
# - Removing entries with multiple word tags
tags = tags[tags['tag'].apply(lambda x: len(x.split()) == 1)]

len(tags)

54219

In [12]:
tags.drop(columns='Unnamed: 0')


Unnamed: 0,Unnamed: 0.1,userId,movieId,tag,timestamp
7,266,318,260,1970s,2015-02-20 22:42:49
8,267,318,115149,Action,2015-02-21 15:58:30
15,274,320,2762,twist,2006-04-25 11:33:52
16,275,320,2959,twist,2006-04-25 11:30:58
17,276,320,3996,overrated,2006-04-25 11:32:28
...,...,...,...,...,...
109306,390955,138280,116797,history,2015-01-30 23:07:25
109307,390956,138280,116797,informatics,2015-01-30 23:07:35
109308,390957,138280,116797,mathematics,2015-01-30 23:07:17
109310,390959,138280,117871,image,2015-01-30 23:09:16


In [13]:
grouped_data = tags.groupby(['userId', 'movieId']).size().reset_index(name='num_tags')
average_tags_per_movie_per_user = grouped_data['num_tags'].mean()
print(f"The average number of tags that a userId has given a movie is {average_tags_per_movie_per_user}")


The average number of tags that a userId has given a movie is 2.105265201522094


In [14]:
# To find the number of distinct userIds
distinct_userIds = tags['userId'].nunique()
print(f"The number of distinct userIds is {distinct_userIds}")

The number of distinct userIds is 1699


Removing empty string tags

In [15]:
tags = tags[tags['tag'] != '']
tags = tags[tags['tag'] != None]

English Language Only

- Using FastText model

In [16]:
import sys
!{sys.executable} -m pip install fasttext





In [17]:
# No. of tags (all languages)
len_all = len(tags['tag'].unique())

- Check whether this should be done before or after lemma

In [19]:
import fasttext

# Load the model
language_model = fasttext.load_model("../pretrain_model/lid.176.bin")

# Define a function to detect language
def is_english(text):
    try:
        predictions = language_model.predict(text, k=1)
        return predictions[0][0] == '__label__en'
    except:
        return False

# Assuming `tags` is your DataFrame and it has a `tag` column
tags['is_english'] = tags['tag'].apply(is_english)

# Filter rows where the tag is in English and 
tags = tags.loc[(tags['is_english']) | ((~tags['is_english']) & tags['tag'].isin(keep))]

# Drop the 'is_english' column as it's no longer needed
tags.drop(columns=['is_english'], inplace=True)

# No. of tags (ENGLISH)
len_eng = len(tags['tag'].unique())

# calculate the percentage of English tags:
per = len_eng/len_all * 100
print(str(per) + " %")

100.0 %


In [20]:
from collections import Counter

# Initialize Counter
tag_counter = Counter(tags['tag'])

# Filter tags based on length and count frequency of short tags
short_tags_counter = {k: v for k, v in tag_counter.items() if len(k) < 4}

# Sort by frequency in descending order
sorted_short_tags = {k: v for k, v in sorted(short_tags_counter.items(), key=lambda item: item[1], reverse=True)}

# Output frequencies of short tags
print("Frequency of short tags in descending order:")
for tag, freq in sorted_short_tags.items():
    print(f"{tag}: {freq}")

Frequency of short tags in descending order:
R: 326
DVD: 188
own: 142
get: 53
cgi: 47
2.5: 43
War: 41
3.5: 34
buy: 29
art: 28
80s: 27
3d: 24
Gay: 21
1: 21
G: 20
dog: 19
CGI: 18
wry: 16
SF: 15
DC: 14
FBI: 14
60s: 10
odd: 10
90s: 10
f: 10
law: 8
oil: 8
NE: 8
Old: 7
70s: 7
Art: 6
cat: 6
MT: 6
old: 5
NYC: 5
MMA: 4
pub: 4
hs: 4
bad: 4
30s: 4
ok: 3
meh: 3
ice: 3
DIY: 3
UK: 3
wtf: 3
WWI: 2
zoo: 2
bio: 2
box: 2
gun: 2
men: 2
dry: 2
fbb: 2
Sea: 2
tea: 2
eye: 2
bus: 2
all: 2
s: 2
Cat: 2
POW: 2
tps: 1
ss: 1
bc: 1
pig: 1
WHO: 1
Gun: 1
Boy: 1
toy: 1
run: 1
egg: 1
cb: 1
orc: 1
red: 1
wit: 1
UAV: 1
WTF: 1
Doc: 1
wy: 1
cue: 1
koo: 1
beh: 1
4.5: 1
phd: 1
JR: 1
SNL: 1
bjj: 1
PKD: 1
Rap: 1
X: 1
nyc: 1
wig: 1
bed: 1
ABA: 1
daf: 1
DPD: 1
Sad: 1
edw: 1
fox: 1
c: 1
a: 1
SS: 1
USN: 1
GDR: 1
AI: 1
Mob: 1
WTO: 1
Hal: 1
car: 1
McG: 1
SAS: 1
ect: 1
paz: 1
age: 1
see: 1
RAF: 1
Law: 1


Exploring the tags
- Length
- Topic
- Semantics
- Unreliable tags: based on NER and semantic
Consider -> using a pre-trained model for KNOWN words, and then writing an algorithm to process UNKNOWN words


In [21]:
tags

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,tag,timestamp
7,266,301,318,260,1970s,2015-02-20 22:42:49
8,267,304,318,115149,Action,2015-02-21 15:58:30
15,274,312,320,2762,twist,2006-04-25 11:33:52
16,275,313,320,2959,twist,2006-04-25 11:30:58
17,276,314,320,3996,overrated,2006-04-25 11:32:28
...,...,...,...,...,...,...
109306,390955,464426,138280,116797,history,2015-01-30 23:07:25
109307,390956,464427,138280,116797,informatics,2015-01-30 23:07:35
109308,390957,464428,138280,116797,mathematics,2015-01-30 23:07:17
109310,390959,464430,138280,117871,image,2015-01-30 23:09:16


### Pre-processing

Order of the preprocessing steps is important. 

1. Lowercase

2. Remove punctuation, symbols

3. English restriction 

4. Spellchecking:
- This is applied before stemming and tokenisation. 
- Ensures valid words are considered before lemmatisation


5. Remove stop words

6. Lemmatisation

Justification of order:
- Spellcheck before tokenisation allows this process to be contextually informed based on the order of words


### 0) Remove conjoined words 
- some tags may be multiple words in one -> remove these


In [26]:
import nltk
nltk.download('words')

[nltk_data] Downloading package words to /Users/jiayi/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [27]:
from nltk.corpus import words, wordnet

words = set(words.words())



def check_conjoined(tag, words):
    i = 0 
    conjoined_ls = []
    while i < len(tag):
        for x in range(len(tag), i, -1):
            subword = tag[i:x]
            if subword in words or bool(wordnet.synsets(subword)):
                conjoined_ls.append(subword)
                i = x
                break
        else:
            i += 1
    if len(conjoined_ls) > 1:
        return True
    else:
        return False
                

tags['is_conjoined'] = tags['tag'].apply(lambda x: check_conjoined(x, words))

tags = tags[tags['is_conjoined'] == False]



##### 1) Lowercase: Convert to lowercase

In [28]:
tags['tag'] = tags['tag'].str.lower() #lowercase


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['tag'] = tags['tag'].str.lower() #lowercase


##### 2) Remove punctuation, symbols, numbers
- only removing from words that are not in the keep list

In [29]:
# Keep words in 'keep' as is, remove non-alphabetic characters from all other words
tags['tag'] = tags['tag'].apply(lambda x: x if x in keep else ''.join(c for c in x if c.isalpha()))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['tag'] = tags['tag'].apply(lambda x: x if x in keep else ''.join(c for c in x if c.isalpha()))


In [30]:
# Filter the DataFrame to only include rows where the 'tag' column contains a hyphen
hyphen_tags = tags[tags['tag'].str.contains('-', na=False)]

# Print these rows or just the 'tag' column
print(hyphen_tags['tag'])

exclam_tags = tags[tags['tag'].str.contains('!', na=False)]
print(exclam_tags['tag'])



Series([], Name: tag, dtype: object)
Series([], Name: tag, dtype: object)


##### 3) Spellchecking:

Current method: spello python package (ref: https://pypi.org/project/spello/)
- Output is placed in a separate column

Need to consider: 
- slang
- abbreviations

Limitations:
Future Scope / Limitations
One of the limitations of the current model is, it does not suggest corrections for any grammatical mistakes or for words in the vocabulary of the model. For example, in a sentence “I want to by Apple”, it will not suggest any correction for “by” as it is a valid English word but the correct replacement should be "buy".

- **Potential Solution**: Can consider training the data on actors names or slang, manually impute this in

These are difficult to handle with the contextual spell check, solution: use more advanced spell checker informer by context, or create a custom solution
--> https://huggingface.co/facebook/bart-base



In [31]:
tags


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,tag,timestamp,is_conjoined
7,266,301,318,260,s,2015-02-20 22:42:49,False
8,267,304,318,115149,action,2015-02-21 15:58:30,False
15,274,312,320,2762,twist,2006-04-25 11:33:52,False
16,275,313,320,2959,twist,2006-04-25 11:30:58,False
17,276,314,320,3996,overrated,2006-04-25 11:32:28,False
...,...,...,...,...,...,...,...
109306,390955,464426,138280,116797,history,2015-01-30 23:07:25,False
109307,390956,464427,138280,116797,informatics,2015-01-30 23:07:35,False
109308,390957,464428,138280,116797,mathematics,2015-01-30 23:07:17,False
109310,390959,464430,138280,117871,image,2015-01-30 23:09:16,False


Need to load pretrained English model file for spello:
Path: located in Desktop/Thesis/... path 

In [37]:
# from spello.model import SpellCorrectionModel
# sp = SpellCorrectionModel(language='en')
# sp.load('../pretrain_model/spello/en.pkl') # Need to download this locally

from spello.model import SpellCorrectionModel

# Step 1: Load the model
sp = SpellCorrectionModel(language='en')
sp.load('../pretrain_model/spello/en.pkl')

# Step 2: Apply configurations
sp.config.min_length_for_spellcorrection = 4  # You can adjust this based on your needs
sp.config.max_length_for_spellcorrection = 12  # You can adjust this based on your needs

# Step 3: Save the model again
sp.save(model_save_dir='../pretrain_model/spello')

# Now, the model should load without any warnings in the future



from spello.model import SpellCorrectionModel 
sp = SpellCorrectionModel(language='en')  
sp.load('/home/ubuntu/model.pkl')
sp.config.min_length_for_spellcorrection = 4 # default is 3
sp.config.max_length_for_spellcorrection = 12 # default is 15
sp.save(model_save_dir='/home/ubuntu/')




'/Users/jiayi/Desktop/Courses/Research pathway/Maryam/Code/pretrain_model/spello/model.pkl'

In [38]:
def spellCheckSuggest(tag):
    '''Apply spello python package to the tag column'''
    if tag in keep:
        return tag  # return the original tag if it's in the 'keep' list
    else:
        spell_check = sp.spell_correct(tag)
        suggest = spell_check['spell_corrected_text']
        return suggest  # return the spell-corrected tag otherwise

# Applying the function to the DataFrame
tags['spellCheckSuggestDoc'] = tags['tag'].apply(spellCheckSuggest)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['spellCheckSuggestDoc'] = tags['tag'].apply(spellCheckSuggest)


In [39]:
tags

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,tag,timestamp,is_conjoined,spellCheckSuggestDoc
7,266,301,318,260,s,2015-02-20 22:42:49,False,s
8,267,304,318,115149,action,2015-02-21 15:58:30,False,action
15,274,312,320,2762,twist,2006-04-25 11:33:52,False,twist
16,275,313,320,2959,twist,2006-04-25 11:30:58,False,twist
17,276,314,320,3996,overrated,2006-04-25 11:32:28,False,overrated
...,...,...,...,...,...,...,...,...
109306,390955,464426,138280,116797,history,2015-01-30 23:07:25,False,history
109307,390956,464427,138280,116797,informatics,2015-01-30 23:07:35,False,informatics
109308,390957,464428,138280,116797,mathematics,2015-01-30 23:07:17,False,mathematics
109310,390959,464430,138280,117871,image,2015-01-30 23:09:16,False,image


**Heuristics to ensure incorrect spellcheck results are not replacements of correct words**
- Length Difference: If the difference in length between the original word and the corrected word is significant, it could be an incorrect correction.

- Edit Distance: Utilize the Levenshtein distance (or another string distance metric) to check how many changes are required to transform the original word into the corrected word. A high number of changes might signify an incorrect correction.

- First and Last Characters: Check if both the first and last characters are different between the original and corrected word.

- Frequency of Correction: If a correction occurs very frequently, it might be a systematic error rather than a true correction.


In [40]:
from Levenshtein import distance

def check_differences(row):
    original_tag = row['tag']
    corrected_tag = row['spellCheckSuggestDoc']
    if original_tag == None or original_tag == '':
        return False
    
    # Check if the first three letters are different
    if original_tag[:3] != corrected_tag[:3]:
        return True

    # Check if the length difference is significant (e.g., more than 3 characters)
    if abs(len(original_tag) - len(corrected_tag)) > 3:
        return True

    # Check if the edit distance is significant (e.g., more than 3 changes)
    if distance(original_tag, corrected_tag) > 3:
        return True

    # Check if both the first and last characters are different
    if original_tag[0] != corrected_tag[0] and original_tag[-1] != corrected_tag[-1]:
        return True
    


    return False

# Create a mask where the condition is True
mask = tags.apply(check_differences, axis=1)

# Where the mask is True, replace 'spellCheckSuggestDoc' with the original 'tag'
tags.loc[mask, 'spellCheckSuggestDoc'] = tags.loc[mask, 'tag']


In [41]:
tags

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,tag,timestamp,is_conjoined,spellCheckSuggestDoc
7,266,301,318,260,s,2015-02-20 22:42:49,False,s
8,267,304,318,115149,action,2015-02-21 15:58:30,False,action
15,274,312,320,2762,twist,2006-04-25 11:33:52,False,twist
16,275,313,320,2959,twist,2006-04-25 11:30:58,False,twist
17,276,314,320,3996,overrated,2006-04-25 11:32:28,False,overrated
...,...,...,...,...,...,...,...,...
109306,390955,464426,138280,116797,history,2015-01-30 23:07:25,False,history
109307,390956,464427,138280,116797,informatics,2015-01-30 23:07:35,False,informatics
109308,390957,464428,138280,116797,mathematics,2015-01-30 23:07:17,False,mathematics
109310,390959,464430,138280,117871,image,2015-01-30 23:09:16,False,image


Correct column now: spellCheckSuggestDoc -> using this column as the 'tag' moving forward

In [42]:
tags['NewTag'] = tags['spellCheckSuggestDoc']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['NewTag'] = tags['spellCheckSuggestDoc']


In [43]:
tags

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,tag,timestamp,is_conjoined,spellCheckSuggestDoc,NewTag
7,266,301,318,260,s,2015-02-20 22:42:49,False,s,s
8,267,304,318,115149,action,2015-02-21 15:58:30,False,action,action
15,274,312,320,2762,twist,2006-04-25 11:33:52,False,twist,twist
16,275,313,320,2959,twist,2006-04-25 11:30:58,False,twist,twist
17,276,314,320,3996,overrated,2006-04-25 11:32:28,False,overrated,overrated
...,...,...,...,...,...,...,...,...,...
109306,390955,464426,138280,116797,history,2015-01-30 23:07:25,False,history,history
109307,390956,464427,138280,116797,informatics,2015-01-30 23:07:35,False,informatics,informatics
109308,390957,464428,138280,116797,mathematics,2015-01-30 23:07:17,False,mathematics,mathematics
109310,390959,464430,138280,117871,image,2015-01-30 23:09:16,False,image,image


#### 4) Remove stop words

In [44]:
from spacy.lang.en import STOP_WORDS
  
def remove_stopwords(tag):
    return tag if tag in keep or tag not in STOP_WORDS else ''

tags['NewTag'] = tags['NewTag'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['NewTag'] = tags['NewTag'].apply(remove_stopwords)


In [45]:
tags

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,tag,timestamp,is_conjoined,spellCheckSuggestDoc,NewTag
7,266,301,318,260,s,2015-02-20 22:42:49,False,s,s
8,267,304,318,115149,action,2015-02-21 15:58:30,False,action,action
15,274,312,320,2762,twist,2006-04-25 11:33:52,False,twist,twist
16,275,313,320,2959,twist,2006-04-25 11:30:58,False,twist,twist
17,276,314,320,3996,overrated,2006-04-25 11:32:28,False,overrated,overrated
...,...,...,...,...,...,...,...,...,...
109306,390955,464426,138280,116797,history,2015-01-30 23:07:25,False,history,history
109307,390956,464427,138280,116797,informatics,2015-01-30 23:07:35,False,informatics,informatics
109308,390957,464428,138280,116797,mathematics,2015-01-30 23:07:17,False,mathematics,mathematics
109310,390959,464430,138280,117871,image,2015-01-30 23:09:16,False,image,image


### 5) Lemmatisation (WordNet Lemmatizer with POS Tag)
- Removing stop words before lemmatisation may speed up this process
- Need to use POS tags - this is because without POS, lemmatisation doesn't work effectively. E.g leaves certain tags the same

In [46]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import pandas as pd

# Download necessary NLTK data
nltk.download('averaged_perceptron_tagger')

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function to map NLTK's POS tags to the first character used by WordNetLemmatizer
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

# Function to conditionally lemmatize a single word
def conditional_lemmatize(word, keep):
    if word in keep:
        return word
    else:
        pos = nltk.pos_tag([word])[0][1]  # POS tagging
        wordnet_pos = pos_tagger(pos)     # Map POS tag to first character used by WordNetLemmatizer
        if wordnet_pos is None:
            return word
        else:
            return lemmatizer.lemmatize(word, wordnet_pos)


# Apply the function to the 'NewTag' column
tags['lemmatized_text'] = tags['NewTag'].apply(lambda word: conditional_lemmatize(word, keep))

print(tags)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jiayi/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


        Unnamed: 0.1  Unnamed: 0  userId  movieId          tag  \
7                266         301     318      260            s   
8                267         304     318   115149       action   
15               274         312     320     2762        twist   
16               275         313     320     2959        twist   
17               276         314     320     3996    overrated   
...              ...         ...     ...      ...          ...   
109306        390955      464426  138280   116797      history   
109307        390956      464427  138280   116797  informatics   
109308        390957      464428  138280   116797  mathematics   
109310        390959      464430  138280   117871        image   
109311        390960      464432  138280   117871        story   

                  timestamp  is_conjoined spellCheckSuggestDoc       NewTag  \
7       2015-02-20 22:42:49         False                    s            s   
8       2015-02-21 15:58:30         False        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['lemmatized_text'] = tags['NewTag'].apply(lambda word: conditional_lemmatize(word, keep))


In [47]:
tags['un-lemmatised'] = tags['NewTag']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['un-lemmatised'] = tags['NewTag']


Assigning NewTag column to lemmatized_text
- Removing the brackets from lemmatized_text

In [48]:
tags['NewTag'] = tags['lemmatized_text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['NewTag'] = tags['lemmatized_text']


In [49]:
tags

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,tag,timestamp,is_conjoined,spellCheckSuggestDoc,NewTag,lemmatized_text,un-lemmatised
7,266,301,318,260,s,2015-02-20 22:42:49,False,s,s,s,s
8,267,304,318,115149,action,2015-02-21 15:58:30,False,action,action,action,action
15,274,312,320,2762,twist,2006-04-25 11:33:52,False,twist,twist,twist,twist
16,275,313,320,2959,twist,2006-04-25 11:30:58,False,twist,twist,twist,twist
17,276,314,320,3996,overrated,2006-04-25 11:32:28,False,overrated,overrate,overrate,overrated
...,...,...,...,...,...,...,...,...,...,...,...
109306,390955,464426,138280,116797,history,2015-01-30 23:07:25,False,history,history,history,history
109307,390956,464427,138280,116797,informatics,2015-01-30 23:07:35,False,informatics,informatics,informatics,informatics
109308,390957,464428,138280,116797,mathematics,2015-01-30 23:07:17,False,mathematics,mathematics,mathematics,mathematics
109310,390959,464430,138280,117871,image,2015-01-30 23:09:16,False,image,image,image,image


In [50]:
len((tags['tag']).drop_duplicates())

3133

In [51]:
len((tags['un-lemmatised']).drop_duplicates())

3084

### Removing single character tags
- If they are NOT in the keep list

In [52]:
# Remove single-character entries unless they are in the 'keep' list
tags['NewTag'] = tags['NewTag'].apply(lambda x: x if (len(x) > 1 or x in keep) else None)
tags['un-lemmatised'] = tags['un-lemmatised'].apply(lambda x: x if (len(x) > 1 or x in keep) else None)
# Drop the rows where 'tag' is None
tags.dropna(subset=['tag'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['NewTag'] = tags['NewTag'].apply(lambda x: x if (len(x) > 1 or x in keep) else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['un-lemmatised'] = tags['un-lemmatised'].apply(lambda x: x if (len(x) > 1 or x in keep) else None)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags.dropna(subset=['tag'], inplace=True)


##### Checking whether tag is valid English word
- if not -> remove, as sentiment, semantic analysis will be valid on actual English dictionary words

### Renaming tag column -> assign NewTag, etc

- Column to access is 'tag' now

In [53]:
tags['NewTag'] = tags['lemmatized_text']
tags['tag'] = tags['NewTag']
tags

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['NewTag'] = tags['lemmatized_text']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags['tag'] = tags['NewTag']


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,tag,timestamp,is_conjoined,spellCheckSuggestDoc,NewTag,lemmatized_text,un-lemmatised
7,266,301,318,260,s,2015-02-20 22:42:49,False,s,s,s,
8,267,304,318,115149,action,2015-02-21 15:58:30,False,action,action,action,action
15,274,312,320,2762,twist,2006-04-25 11:33:52,False,twist,twist,twist,twist
16,275,313,320,2959,twist,2006-04-25 11:30:58,False,twist,twist,twist,twist
17,276,314,320,3996,overrate,2006-04-25 11:32:28,False,overrated,overrate,overrate,overrated
...,...,...,...,...,...,...,...,...,...,...,...
109306,390955,464426,138280,116797,history,2015-01-30 23:07:25,False,history,history,history,history
109307,390956,464427,138280,116797,informatics,2015-01-30 23:07:35,False,informatics,informatics,informatics,informatics
109308,390957,464428,138280,116797,mathematics,2015-01-30 23:07:17,False,mathematics,mathematics,mathematics,mathematics
109310,390959,464430,138280,117871,image,2015-01-30 23:09:16,False,image,image,image,image


### Placing dataframe to separate file -> then pipeline to new notebook for Sentiment Analysis models
file: "sentiment_df.csv"
Location: in same repo as this

In [55]:
tags

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,tag,timestamp,is_conjoined,spellCheckSuggestDoc,NewTag,lemmatized_text,un-lemmatised
7,266,301,318,260,s,2015-02-20 22:42:49,False,s,s,s,
8,267,304,318,115149,action,2015-02-21 15:58:30,False,action,action,action,action
15,274,312,320,2762,twist,2006-04-25 11:33:52,False,twist,twist,twist,twist
16,275,313,320,2959,twist,2006-04-25 11:30:58,False,twist,twist,twist,twist
17,276,314,320,3996,overrate,2006-04-25 11:32:28,False,overrated,overrate,overrate,overrated
...,...,...,...,...,...,...,...,...,...,...,...
109306,390955,464426,138280,116797,history,2015-01-30 23:07:25,False,history,history,history,history
109307,390956,464427,138280,116797,informatics,2015-01-30 23:07:35,False,informatics,informatics,informatics,informatics
109308,390957,464428,138280,116797,mathematics,2015-01-30 23:07:17,False,mathematics,mathematics,mathematics,mathematics
109310,390959,464430,138280,117871,image,2015-01-30 23:09:16,False,image,image,image,image


In [56]:
tags = tags.drop(columns={'Unnamed: 0', 'is_conjoined', 'spellCheckSuggestDoc','NewTag',  'lemmatized_text'})

In [57]:
tags

Unnamed: 0,Unnamed: 0.1,userId,movieId,tag,timestamp,un-lemmatised
7,266,318,260,s,2015-02-20 22:42:49,
8,267,318,115149,action,2015-02-21 15:58:30,action
15,274,320,2762,twist,2006-04-25 11:33:52,twist
16,275,320,2959,twist,2006-04-25 11:30:58,twist
17,276,320,3996,overrate,2006-04-25 11:32:28,overrated
...,...,...,...,...,...,...
109306,390955,138280,116797,history,2015-01-30 23:07:25,history
109307,390956,138280,116797,informatics,2015-01-30 23:07:35,informatics
109308,390957,138280,116797,mathematics,2015-01-30 23:07:17,mathematics
109310,390959,138280,117871,image,2015-01-30 23:09:16,image


In [62]:
tags.to_csv("../dataset/sentiment_df.csv",index=False)
tags.to_csv("../dataset/tag_full_preprocessed.csv",index=False)

In [2]:
import pandas as pd
tags = pd.read_csv("../dataset/tag_full_preprocessed.csv")
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50191 entries, 0 to 50190
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     50191 non-null  int64 
 1   Unnamed: 0.1   50191 non-null  int64 
 2   userId         50191 non-null  int64 
 3   movieId        50191 non-null  int64 
 4   tag            49904 non-null  object
 5   timestamp      50191 non-null  object
 6   un-lemmatised  49078 non-null  object
dtypes: int64(4), object(3)
memory usage: 2.7+ MB
