In [1]:
import pandas as pd
import re

In [2]:
def load_csv(path):
    """
    Read a header-less, tab-separated file into a pandas DataFrame.

    The columns are labelled ``ts``, ``id``, ``uname``, ``name`` and
    ``comment`` (in that order). The file is decoded as UTF-8, and lines
    that pandas considers malformed are skipped instead of raising.

    Parameters
    ----------
    path : path of the file to read (anything ``pd.read_csv`` accepts).

    Returns
    -------
    pandas.DataFrame with the five columns listed above.
    """
    column_names = ["ts", "id", "uname", "name", "comment"]
    read_options = dict(
        sep="\t",
        header=None,  # the file has no header row; names are supplied here
        names=column_names,
        encoding="utf-8",
        on_bad_lines="skip",  # drop badly formatted lines rather than fail
    )
    frame = pd.read_csv(path, **read_options)
    return frame

In [3]:
import re

# Punctuation characters deleted (not replaced by a space) before tokenizing.
_PUNCTUATION_PATTERN = re.compile(r"[\[\]\.\(\),\?!\"'@:#]")

# Code-point ranges that are deleted from the text as emoji/pictographs.
_EMOJI_PATTERN = re.compile(
    r"(["
    r"\U0001F1E0-\U0001F1FF"
    r"\U0001F300-\U0001F5FF"
    r"\U0001F600-\U0001F64F"
    r"\U0001F680-\U0001F6FF"
    r"\U0001F700-\U0001F77F"
    r"\U0001F780-\U0001F7FF"
    r"\U0001F800-\U0001F8FF"
    r"\U0001F900-\U0001F9FF"
    r"\U0001FA00-\U0001FA6F"
    r"\U0001FA70-\U0001FAFF"
    r"\u2702-\u27B0"
    r"])"
)

# NOTE(review): presumably the dataset encodes line breaks and tabs as
# "[NEWLINE]" / "[TAB]" placeholders -- verify against the data file.
_BRACKETED_PLACEHOLDER_PATTERN = re.compile(r"\[(?:newline|tab)\]", re.IGNORECASE)

# Stand-alone "newline" / "tab" tokens (after lower-casing). The word
# boundaries keep words such as "table" or "vegetables" intact.
_BARE_PLACEHOLDER_PATTERN = re.compile(r"\b(?:newline|tab)\b")


def doc_to_terms(string):
    """
    Normalize a document and return its distinct terms.

    Processing steps:
      1. bracketed ``[NEWLINE]`` / ``[TAB]`` placeholders become a space;
      2. the punctuation characters ``[ ] . ( ) , ? ! " ' @ : #`` are
         deleted and the text is lower-cased;
      3. emoji in the ranges of ``_EMOJI_PATTERN`` are deleted;
      4. stand-alone ``newline`` / ``tab`` tokens become a space (they are
         no longer removed from the middle of other words);
      5. the text is split on whitespace and de-duplicated.

    Parameters
    ----------
    string : the raw document text. Anything that is not a ``str``
        (e.g. the NaN pandas uses for a missing field) is treated as an
        empty document.

    Returns
    -------
    list of str: the unique terms in order of first occurrence. Empty or
    whitespace-only input yields ``[]`` (never ``[""]``).
    """
    # Missing values arrive as float NaN / None; there is nothing to tokenize.
    if not isinstance(string, str):
        return []

    # Replace placeholders while the brackets still delimit them, so that
    # "a[NEWLINE]b" splits into two terms.
    text = _BRACKETED_PLACEHOLDER_PATTERN.sub(" ", string)
    text = _PUNCTUATION_PATTERN.sub("", text).lower()
    text = _EMOJI_PATTERN.sub("", text)
    text = _BARE_PLACEHOLDER_PATTERN.sub(" ", text)

    # str.split() without arguments drops leading/trailing whitespace and
    # returns [] for an empty string; dict.fromkeys de-duplicates while
    # keeping a deterministic (first-seen) order.
    return list(dict.fromkeys(text.split()))


In [11]:
def index(filename):
    """
    Build a non-positional inverted index from a tab-separated tweet file.

    The file is read with ``load_csv``; every row's ``comment`` is turned
    into terms with ``doc_to_terms`` and the row's ``id`` is recorded for
    each term.

    Parameters
    ----------
    filename : path of the file to index.

    Returns
    -------
    (dictionary, postings)
        dictionary : dict mapping term -> ``(count, pointer)`` where
            ``count`` is the number of distinct ids in the term's posting
            list and ``pointer`` is the key of that list in ``postings``.
        postings : dict mapping pointer (``"p1"``, ``"p2"``, ... in order of
            first appearance of the term) -> sorted list of ids.
    """
    # Load the dataset from the specified file
    tweets = load_csv(filename)

    dictionary = {}
    postings = {}

    # Iterate the two needed columns directly: this avoids building a Series
    # per row (iterrows) and avoids a loop variable that shadows `index`.
    for doc_id, comment in zip(tweets["id"], tweets["comment"]):
        # A missing comment is read by pandas as NaN (a float), which cannot
        # be tokenized; such rows contribute nothing to the index.
        if not isinstance(comment, str):
            continue

        for term in doc_to_terms(comment):
            if not term:
                # Never index the empty string as a term.
                continue

            entry = dictionary.get(term)
            if entry is None:
                # First time this term is seen: allocate a new posting set.
                pointer = "p" + str(len(postings) + 1)
                postings[pointer] = {doc_id}
            else:
                pointer = entry[1]
                postings[pointer].add(doc_id)

            # Derive the count from the posting set so both always agree,
            # even if the same id occurs on more than one row.
            dictionary[term] = (len(postings[pointer]), pointer)

    # Convert posting sets to sorted lists (query_two relies on the order).
    postings = {pointer: sorted(ids) for pointer, ids in postings.items()}

    return dictionary, postings


In [12]:
# Build the index once. `dictionary` and `postings` are module-level globals
# that query() and query_two() read directly.
dictionary, postings = index("data/tweets.csv")

In [6]:
def query(term: str):
    """
    Return the posting list of a single term.

    Reads the module-level ``dictionary`` and ``postings`` built by
    ``index``. The term is looked up exactly as given, so it must already
    be in the normalized form used by the index (``doc_to_terms``
    lower-cases the text and strips punctuation).

    Parameters
    ----------
    term : the term to look up.

    Returns
    -------
    list: the stored posting list (the same list object held in
    ``postings``, not a copy), or an empty list when the term is not in
    the dictionary.
    """
    if term in dictionary:
        _, pointer = dictionary[term]
        return postings[pointer]

    # Unknown term: return an empty *list* so the result type matches the
    # found case (previously an empty dict was returned here).
    return []

In [7]:
def query_two(term1: str, term2: str):
    """
    Return the intersection of the posting lists of two terms.

    Reads the module-level ``dictionary`` and ``postings`` built by
    ``index``. Both posting lists must be sorted in ascending order
    (``index`` sorts them); they are intersected with a linear merge.

    Parameters
    ----------
    term1, term2 : the terms to look up, in the normalized form used by
        the index.

    Returns
    -------
    list: ids present in both posting lists, in ascending order. An empty
    list is returned when either term is not in the dictionary
    (previously this case returned ``None``).
    """
    if term1 not in dictionary or term2 not in dictionary:
        return []

    # get the posting lists of the terms
    _, pointer1 = dictionary[term1]
    _, pointer2 = dictionary[term2]
    t_posting1 = postings[pointer1]
    t_posting2 = postings[pointer2]

    res = []
    pos1 = pos2 = 0
    # Advance through both lists until the end of one of them is reached;
    # always move the cursor that points at the smaller id.
    while pos1 < len(t_posting1) and pos2 < len(t_posting2):
        doc1 = t_posting1[pos1]
        doc2 = t_posting2[pos2]
        if doc1 == doc2:
            res.append(doc1)
            pos1 += 1
            pos2 += 1
        elif doc1 < doc2:
            pos1 += 1
        else:
            pos2 += 1

    return res

In [8]:
# example usage: “show me tweets of people who talk about the side effects of malaria vaccines”

In [13]:
# Single-term lookup: displays the sorted ids of the tweets whose comment
# contains the term "reduce".
query("reduce")

[959514944340406274,
 959575041984815104,
 959692402897252352,
 960943225950363648,
 962312288895082496,
 967482743352446976,
 968581927103025152,
 968942042767380482,
 968978184438915072,
 971107739257720832,
 975144519481016321,
 975611613645504513,
 976245297922719744,
 976814069666660352,
 976891628135895041,
 977600179607166978,
 977931146658045954,
 978029187364786177,
 978045723701907456,
 979010694732533761,
 979334908157063170,
 981754225544790017,
 983446352284864512,
 983452965313961984,
 984116422174564352,
 984174853845495808,
 984808954885787648,
 986672130233458690,
 986701837213618176,
 987239808811454464,
 989494717954932738,
 991387528207982593,
 991428556583522304,
 993159352772190209,
 994173759237083136,
 994570946630602752,
 995709620365811714,
 996829924462923776,
 997526703588032512,
 998652713645813760,
 998712863899303936,
 1002642808736505856,
 1008860493854003200,
 1013512078257197056,
 1018620460458704901,
 1022625149810630656]