In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
csv_file = 'data/file_information.csv'
plagiarism_df = pd.read_csv(csv_file)

# print out the first few rows of data info
plagiarism_df.head()

Unnamed: 0,File,Task,Category
0,g0pA_taska.txt,a,non
1,g0pA_taskb.txt,b,cut
2,g0pA_taskc.txt,c,light
3,g0pA_taskd.txt,d,heavy
4,g0pA_taske.txt,e,non


In [3]:
# Read in a csv file and return a transformed dataframe
def numerical_dataframe(csv_file='data/file_information.csv'):
    '''Reads in a csv file which is assumed to have `File`, `Category` and `Task` columns.
       This function does two things: 
       1) converts `Category` column values to numerical values 
       2) Adds a new, numerical `Class` label column.
       The `Class` column will label plagiarized answers as 1 and non-plagiarized as 0.
       Source texts have a special label, -1.
       :param csv_file: The directory for the file_information.csv file
       :return: A dataframe with numerical categories and a new `Class` label column'''
    plagiarism_df = pd.read_csv(csv_file)
    cat_dct = {"non": 0, "heavy": 1, "light": 2, "cut": 3, "orig": -1}

    return plagiarism_df.assign(
        Category=lambda x: x["Category"].map(lambda y: cat_dct[y]),
        Class=lambda x: x["Category"].map(lambda y: y if y < 1 else 1),
    )



In [4]:
# informal testing, print out the results of a called function
# create new `transformed_df`
transformed_df = numerical_dataframe(csv_file ='data/file_information.csv')

# check work
# check that all categories of plagiarism have a class label = 1
transformed_df.head(10)

Unnamed: 0,File,Task,Category,Class
0,g0pA_taska.txt,a,0,0
1,g0pA_taskb.txt,b,3,1
2,g0pA_taskc.txt,c,2,1
3,g0pA_taskd.txt,d,1,1
4,g0pA_taske.txt,e,0,0
5,g0pB_taska.txt,a,0,0
6,g0pB_taskb.txt,b,0,0
7,g0pB_taskc.txt,c,3,1
8,g0pB_taskd.txt,d,2,1
9,g0pB_taske.txt,e,1,1


In [5]:
# test cell that creates `transformed_df`, if tests are passed

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""

# importing tests
import problem_unittests as tests

# test numerical_dataframe function
tests.test_numerical_df(numerical_dataframe)

# if above test is passed, create NEW `transformed_df`
transformed_df = numerical_dataframe(csv_file ='data/file_information.csv')

# check work
print('\nExample data: ')
transformed_df.head()

Tests Passed!

Example data: 


Unnamed: 0,File,Task,Category,Class
0,g0pA_taska.txt,a,0,0
1,g0pA_taskb.txt,b,3,1
2,g0pA_taskc.txt,c,2,1
3,g0pA_taskd.txt,d,1,1
4,g0pA_taske.txt,e,0,0


In [6]:
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
import helpers 

# create a text column 
text_df = helpers.create_text_column(transformed_df)
text_df.head()

Unnamed: 0,File,Task,Category,Class,Text
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...
1,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...
2,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...
3,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...
4,g0pA_taske.txt,e,0,0,dynamic programming is an algorithm design tec...


In [7]:
random_seed = 1 # can change; set for reproducibility

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
import helpers

# create new df with Datatype (train, test, orig) column
# pass in `text_df` from above to create a complete dataframe, with all the information you need
complete_df = helpers.train_test_dataframe(text_df, random_seed=random_seed)

# check results
complete_df.head(10)

Unnamed: 0,File,Task,Category,Class,Text,Datatype
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...,train
1,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...,test
2,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...,train
3,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...,train
4,g0pA_taske.txt,e,0,0,dynamic programming is an algorithm design tec...,train
5,g0pB_taska.txt,a,0,0,inheritance is a basic concept in object orien...,train
6,g0pB_taskb.txt,b,0,0,pagerank pr refers to both the concept and the...,train
7,g0pB_taskc.txt,c,3,1,vector space model is an algebraic model for r...,test
8,g0pB_taskd.txt,d,2,1,bayes theorem relates the conditional and marg...,train
9,g0pB_taske.txt,e,1,1,dynamic programming is a method for solving ma...,test


In [76]:
from sklearn.feature_extraction.text import CountVectorizer
import toolz as tz
import re

def get_text(df: pd.DataFrame, filename: str) -> str:
    return df.loc[lambda x: x["File"]==filename, "Text"].iat[0]

#def get_source_task(filename: str) -> str:
#    task = re.search(r'\w*_(\w*)\.txt', filename).group(1)
#    return f"orig_{task}.txt"

def get_source_task(filename: str) -> str:
    task = re.search(r'\w*_(\w*)\.txt', filename)
    if task:
        return f"orig_{task.group(1)}.txt"

def make_n_gram_array(a_text: str, s_text: str, n: int) -> np.ndarray:
    counts = CountVectorizer(analyzer='word', 
                             ngram_range=(n,n), 
                             #stop_words="english",
                            )
    return counts.fit_transform([a_text, s_text]).toarray()

# Calculate the ngram containment for one answer file/source file pair in a df
def calculate_containment(df, n, answer_filename):
    '''Calculates the containment between a given answer text and its associated source text.
       This function creates a count of ngrams (of a size, n) for each text file in our data.
       Then calculates the containment by finding the ngram count for a given answer text, 
       and its associated source text, and calculating the normalized intersection of those counts.
       :param df: A dataframe with columns,
           'File', 'Task', 'Category', 'Class', 'Text', and 'Datatype'
       :param n: An integer that defines the ngram size
       :param answer_filename: A filename for an answer text in the df, ex. 'g0pB_taskd.txt'
       :return: A single containment value that represents the similarity
           between an answer text and its source text.
    '''
    category = df.loc[lambda x: x["File"]==filename, "Category"].iat[0]
    print(f"Category is {category}\n")
    a_text = get_text(df, answer_filename)
    print(a_text)
    s_text = get_text(df, get_source_task(answer_filename))
    print(s_text)
    ngram_array = make_n_gram_array(a_text, s_text, n)
    print(n)
    #intersection_counts = np.where(ngram_array[0] & ngram_array[1])[0].shape[0]
    intersection_counts = np.where(ngram_array[0] & ngram_array[1])[0].shape[0]
    print(f"Intersection count is {intersection_counts}\n")
    total_n_grams_in_a = np.where(ngram_array[0] > 0)[0].shape[0]
    print(f"Total n_grams in a {total_n_grams_in_a}\n")
    print(f"Containment is {intersection_counts / total_n_grams_in_a}\n\n")
    return intersection_counts / total_n_grams_in_a
    

def containment(ngram_array: np.ndarray) -> float:
    ''' Containment is a measure of text similarity. It is the normalized, 
       intersection of ngram word counts in two texts.
       :param ngram_array: an array of ngram counts for an answer and source text.
       :return: a normalized containment value.'''
    
    
    # your code here
    #print(np.where(ngram_array[0] & ngram_array[1]))
    intersection_counts = np.where(ngram_array[0] & ngram_array[1])[0].shape[0]
    #print(intersection_counts)
    total_n_grams_in_a = np.where(ngram_array[0] > 0)[0].shape[0]
    #print(total_n_grams_in_a)
    return intersection_counts / total_n_grams_in_a

def text_to_containment(a_text: str, s_text: str, n: int) -> float:
    ngram_array = make_n_gram_array(a_text, s_text, n)
    return containment(ngram_array)

def add_source_col(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(Source=lambda x: x["File"].map(get_source_task)).merge(
        df[["File", "Text"]], left_on="Source", right_on="File"
    )


def add_n_gram_col(df: pd.DataFrame, n: int) -> pd.DataFrame:
    return df.assign(
        **{
            f"score_{n}": lambda y: y.apply(
                lambda x: text_to_containment(x["Text_x"], x["Text_y"], n), axis=1
            )
        }
    )


def compare_n_gram_performance(df: pd.DataFrame, n: int, group: str) -> pd.DataFrame:
    df = add_n_gram_col(df, n)
    return df.groupby(group)[f"score_{n}"].describe()

In [67]:
with_source_txt = add_source_col(complete_df)

In [85]:
with_1_gram_df = add_n_gram_col(with_source_txt, 1)

In [86]:
test_names = ['g0pA_taska.txt', 'g0pA_taskb.txt', 'g0pA_taskc.txt', 'g0pA_taskd.txt']

In [89]:
with_1_gram_df[lambda x: x["File_x"].isin(test_names)]

Unnamed: 0,File_x,Task,Category,Class,Text_x,Datatype,Source,File_y,Text_y,score_1
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...,0.276596
20,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...,test,orig_taskb.txt,orig_taskb.txt,pagerank is a link analysis algorithm used by ...,0.841121
40,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...,train,orig_taskc.txt,orig_taskc.txt,vector space model or term vector model is an ...,0.781513
60,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...,train,orig_taskd.txt,orig_taskd.txt,in probability theory bayes theorem often call...,0.351351


In [90]:
with_3_gram_df = add_n_gram_col(with_source_txt, 3)

In [91]:
with_3_gram_df[lambda x: x["File_x"].isin(test_names)]

Unnamed: 0,File_x,Task,Category,Class,Text_x,Datatype,Source,File_y,Text_y,score_3
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...,0.009756
20,g0pA_taskb.txt,b,3,1,pagerank is a link analysis algorithm used by ...,test,orig_taskb.txt,orig_taskb.txt,pagerank is a link analysis algorithm used by ...,0.947917
40,g0pA_taskc.txt,c,2,1,the vector space model also called term vector...,train,orig_taskc.txt,orig_taskc.txt,vector space model or term vector model is an ...,0.603774
60,g0pA_taskd.txt,d,1,1,bayes theorem was names after rev thomas bayes...,train,orig_taskd.txt,orig_taskd.txt,in probability theory bayes theorem often call...,0.131868


In [None]:
np.isclose

In [79]:
compare_n_gram_performance(with_source_txt, 1, "Task")

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
a,20.0,0.514271,0.295521,0.178947,0.263753,0.395803,0.748521,1.0
b,20.0,0.406972,0.230728,0.214765,0.254584,0.304252,0.448576,1.0
c,20.0,0.459599,0.226965,0.222222,0.248311,0.417176,0.562055,1.0
d,20.0,0.494748,0.281579,0.160305,0.258989,0.435473,0.680978,1.0
e,20.0,0.562739,0.233146,0.252033,0.337979,0.591336,0.747291,1.0


In [80]:
compare_n_gram_performance(with_source_txt, 3, "Task")

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
a,20.0,0.316408,0.399054,0.0,0.012307,0.065927,0.660502,1.0
b,20.0,0.261039,0.341775,0.0,0.01678,0.047529,0.491438,1.0
c,20.0,0.249282,0.286109,0.0,0.008033,0.15053,0.403391,1.0
d,20.0,0.351034,0.35293,0.0,0.038853,0.229435,0.54323,1.0
e,20.0,0.411282,0.392011,0.0,0.026187,0.362037,0.80593,1.0


In [77]:
compare_n_gram_performance(with_source_txt, 1, "Category")

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
0,38.0,0.256179,0.049903,0.160305,0.228745,0.251016,0.278451,0.365079
1,19.0,0.503479,0.156197,0.318182,0.389836,0.445946,0.573329,0.937984
2,19.0,0.626144,0.172559,0.317647,0.507143,0.641304,0.772436,0.974684
3,19.0,0.661525,0.247571,0.22807,0.445833,0.741935,0.861737,0.992366


In [78]:
compare_n_gram_performance(with_source_txt, 2, "Category")

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
0,38.0,0.065678,0.02702,0.010929,0.047489,0.063988,0.084816,0.116564
1,19.0,0.351883,0.199199,0.074713,0.207527,0.3,0.476807,0.927928
2,19.0,0.547216,0.23745,0.210884,0.339905,0.607143,0.715711,0.984674
3,19.0,0.651294,0.323109,0.036697,0.372955,0.817073,0.931204,0.991228


In [82]:
from sklearn.feature_extraction.text import CountVectorizer
import toolz as tz
import re

def get_text(df: pd.DataFrame, filename: str) -> str:
    return df.loc[lambda x: x["File"]==filename, "Text"].iat[0]

#def get_source_task(filename: str) -> str:
#    task = re.search(r'\w*_(\w*)\.txt', filename).group(1)
#    return f"orig_{task}.txt"

def get_source_task(filename: str) -> str:
    task = re.search(r'\w*_(\w*)\.txt', filename)
    if task:
        return f"orig_{task.group(1)}.txt"

def make_n_gram_array(a_text: str, s_text: str, n: int) -> np.ndarray:
    counts = CountVectorizer(analyzer='word', 
                             ngram_range=(n,n), 
                             #stop_words="english",
                            )
    return counts.fit_transform([a_text, s_text]).toarray()

def containment(ngram_array: np.ndarray) -> float:
    ''' Containment is a measure of text similarity. It is the normalized, 
       intersection of ngram word counts in two texts.
       :param ngram_array: an array of ngram counts for an answer and source text.
       :return: a normalized containment value.'''
    
    
    # your code here
    #print(np.where(ngram_array[0] & ngram_array[1]))
    intersection_counts = np.where(ngram_array[0] & ngram_array[1])[0].shape[0]
    #print(intersection_counts)
    total_n_grams_in_a = np.where(ngram_array[0] > 0)[0].shape[0]
    #print(total_n_grams_in_a)
    return intersection_counts / total_n_grams_in_a

# Calculate the ngram containment for one answer file/source file pair in a df
def calculate_containment(df, n, answer_filename):
    '''Calculates the containment between a given answer text and its associated source text.
       This function creates a count of ngrams (of a size, n) for each text file in our data.
       Then calculates the containment by finding the ngram count for a given answer text, 
       and its associated source text, and calculating the normalized intersection of those counts.
       :param df: A dataframe with columns,
           'File', 'Task', 'Category', 'Class', 'Text', and 'Datatype'
       :param n: An integer that defines the ngram size
       :param answer_filename: A filename for an answer text in the df, ex. 'g0pB_taskd.txt'
       :return: A single containment value that represents the similarity
           between an answer text and its source text.
    '''
    category = df.loc[lambda x: x["File"]==filename, "Category"].iat[0]
    print(f"Category is {category}\n")
    a_text = get_text(df, answer_filename)
    print(a_text)
    s_text = get_text(df, get_source_task(answer_filename))
    print(s_text)
    ngram_array = make_n_gram_array(a_text, s_text, n)
    return containment(ngram_array)
    


def text_to_containment(a_text: str, s_text: str, n: int) -> float:
    ngram_array = make_n_gram_array(a_text, s_text, n)
    return containment(ngram_array)

def add_source_col(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(Source=lambda x: x["File"].map(get_source_task)).merge(
        df[["File", "Text"]], left_on="Source", right_on="File"
    )


def add_n_gram_col(df: pd.DataFrame, n: int) -> pd.DataFrame:
    return df.assign(
        **{
            f"score_{n}": lambda y: y.apply(
                lambda x: text_to_containment(x["Text_x"], x["Text_y"], n), axis=1
            )
        }
    )


def compare_n_gram_performance(df: pd.DataFrame, n: int, group: str) -> pd.DataFrame:
    df = add_n_gram_col(df, n)
    return df.groupby(group)[f"score_{n}"].describe()

In [30]:
with_source_txt = complete_df.assign(
    Source=lambda x: x["File"].map(get_source_task)
).merge(complete_df[["File", "Text"]], left_on="Source", right_on="File")


In [57]:
compare_n_gram_performance(with_source_txt, 2)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
0,38.0,0.01418,0.014079,0.0,0.000988,0.01084,0.019393,0.065217
1,19.0,0.267766,0.218669,0.026178,0.105516,0.219512,0.398795,0.895397
2,19.0,0.473429,0.255113,0.119403,0.220562,0.472603,0.63522,0.981481
3,19.0,0.639966,0.34097,0.0,0.336269,0.744048,0.944044,0.98374


In [59]:
compare_n_gram_performance(with_source_txt, 1)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
0,38.0,0.01418,0.014079,0.0,0.000988,0.01084,0.019393,0.065217
1,19.0,0.267766,0.218669,0.026178,0.105516,0.219512,0.398795,0.895397
2,19.0,0.473429,0.255113,0.119403,0.220562,0.472603,0.63522,0.981481
3,19.0,0.639966,0.34097,0.0,0.336269,0.744048,0.944044,0.98374


In [41]:
with_score_3 = with_source_txt.assign(
    score_3=lambda y: y.apply(
        lambda x: text_to_containment(x["Text_x"], x["Text_y"], 3), axis=1
    )
)


In [43]:
with_score_3.groupby("Category")["score_3"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1,5.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
0,38.0,0.01418,0.014079,0.0,0.000988,0.01084,0.019393,0.065217
1,19.0,0.267766,0.218669,0.026178,0.105516,0.219512,0.398795,0.895397
2,19.0,0.473429,0.255113,0.119403,0.220562,0.472603,0.63522,0.981481
3,19.0,0.639966,0.34097,0.0,0.336269,0.744048,0.944044,0.98374


In [75]:
with_source_txt.head()

Unnamed: 0,File_x,Task,Category,Class,Text_x,Datatype,Source,File_y,Text_y
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
1,g0pB_taska.txt,a,0,0,inheritance is a basic concept in object orien...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
2,g0pC_taska.txt,a,1,1,inheritance in object oriented programming is ...,test,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
3,g0pD_taska.txt,a,3,1,inheritance in object oriented programming is ...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
4,g0pE_taska.txt,a,2,1,in object oriented programming inheritance is ...,test,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...


In [37]:
with_source_txt.apply(lambda x: text_to_containment(x["Text_x"], x["Text_y"], 3), axis=1)

0     0.009756
1     0.008439
2     0.104972
3     0.744048
4     0.981481
5     0.000000
6     0.013158
7     0.195238
8     0.003953
9     0.000000
10    0.632653
11    0.428135
12    0.016043
13    0.015544
14    0.940171
15    0.026882
16    0.974265
17    0.207254
18    0.026178
19    1.000000
20    0.947917
21    0.032558
22    0.000000
23    0.547945
24    0.607143
25    0.062500
26    0.010870
27    0.285141
28    0.186441
29    0.020305
        ...   
70    0.219512
71    0.013699
72    0.983740
73    0.233871
74    0.225000
75    0.464706
76    0.895397
77    0.009662
78    0.000000
79    1.000000
80    0.010811
81    0.324074
82    0.666667
83    0.000000
84    0.977778
85    0.027027
86    0.474576
87    0.072165
88    0.010000
89    0.956693
90    0.119403
91    0.023669
92    0.017021
93    0.905512
94    0.510638
95    0.884735
96    0.779661
97    0.400000
98    0.065217
99    1.000000
Length: 100, dtype: float64

In [31]:
with_source_txt

Unnamed: 0,File_x,Task,Category,Class,Text_x,Datatype,Source,File_y,Text_y
0,g0pA_taska.txt,a,0,0,inheritance is a basic concept of object orien...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
1,g0pB_taska.txt,a,0,0,inheritance is a basic concept in object orien...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
2,g0pC_taska.txt,a,1,1,inheritance in object oriented programming is ...,test,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
3,g0pD_taska.txt,a,3,1,inheritance in object oriented programming is ...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
4,g0pE_taska.txt,a,2,1,in object oriented programming inheritance is ...,test,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
5,g1pA_taska.txt,a,0,0,in object oriented programming objects are gro...,test,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
6,g1pB_taska.txt,a,0,0,inheritance is one of the basic concepts of ob...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
7,g1pD_taska.txt,a,2,1,inheritance is a method of forming new classes...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
8,g2pA_taska.txt,a,0,0,inheritance allows programs developed in an ob...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...
9,g2pB_taska.txt,a,0,0,inheritance is an important feature in object...,train,orig_taska.txt,orig_taska.txt,in object oriented programming inheritance is ...


In [83]:
# select a value for n
n = 3

# indices for first few files
test_indices = range(5)

# iterate through files and calculate containment
category_vals = []
containment_vals = []
for i in test_indices:
    # get level of plagiarism for a given file index
    category_vals.append(complete_df.loc[i, 'Category'])
    # calculate containment for given file and n
    filename = complete_df.loc[i, 'File']
    c = calculate_containment(complete_df, n, filename)
    containment_vals.append(c)

# print out result, does it make sense?
print('Original category values: \n', category_vals)
print()
print(str(n)+'-gram containment values: \n', containment_vals)

Category is 0

inheritance is a basic concept of object oriented programming where the basic idea is to create new classes that add extra detail to existing classes this is done by allowing the new classes to reuse the methods and variables of the existing classes and new methods and classes are added to specialise the new class inheritance models the is kind of relationship between entities or objects  for example postgraduates and undergraduates are both kinds of student this kind of relationship can be visualised as a tree structure where student would be the more general root node and both postgraduate and undergraduate would be more specialised extensions of the student node or the child nodes  in this relationship student would be known as the superclass or parent class whereas  postgraduate would be known as the subclass or child class because the postgraduate class extends the student class  inheritance can occur on several layers where if visualised would display a larger tree

In [84]:
# run this test cell
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
# test containment calculation
# params: complete_df from before, and containment function
tests.test_containment(complete_df, calculate_containment)

Category is 0

dynamic programming is an algorithm design technique used for optimisation problems such as minimising or maximising like divide and conquer dynamic programming solves problems by combining solutions to sub problems however unlike divide and conquer sub problems are not always independent as sub problems may share sub sub problems but solution to one sub problem may not affect the solutions to other sub problems of the same problem  there are four steps in dynamic programming  1 characterise structure of an optimal solution  2 define value of optimal solution recursively  3 compute optimal solution values either top down with caching or bottom up in a table  4 construct an optimal solution from computed values  an example of the type of problem for which dynamic programming may be used is given two sequences x x1 xm and y y1 yn find a common subsequence whose length is maximum  dynamic programming reduces computation by solving sub problems in a bottom up fashion and by 

AssertionError: n=1 calculations are incorrect. Double check the intersection calculation.