# Advanced Plagiarism Detection using ML Models

This notebook demonstrates enhanced plagiarism detection techniques using advanced ML models such as Sentence-BERT and CodeBERT, falling back to TF-IDF similarity when those models are not available.

## 1. Install Required Dependencies

In [None]:
!pip install Flask>=2.0.0 PyPDF2>=2.0.0 pdfplumber>=0.6.0 scikit-learn>=1.0.0 \
    sentence-transformers>=2.2.0 transformers>=4.0.0 torch>=1.10.0 \
    easyocr>=1.4.0 python-dotenv>=0.19.0 numpy>=1.20.0

## 2. Import Required Libraries

In [None]:
import os
import re
import numpy as np
import PyPDF2
import pdfplumber
import logging
import zipfile
import tempfile
import shutil
import fnmatch
from collections import defaultdict
import difflib

# ML-related imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier

# Optional heavy dependencies. Each import is attempted inside its own
# try/except ImportError; a module-level *_AVAILABLE flag records the outcome
# and the functions below consult these flags to pick a fallback path.
try:
    from sentence_transformers import SentenceTransformer, util
    SENTENCE_BERT_AVAILABLE = True
    print("Sentence-BERT available")
except ImportError:
    SENTENCE_BERT_AVAILABLE = False
    print("Sentence-BERT not available")

# CodeBERT needs both `transformers` and `torch`; if either import fails the
# flag is set to False.
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    CODEBERT_AVAILABLE = True
    print("CodeBERT dependencies available")
except ImportError:
    CODEBERT_AVAILABLE = False
    print("CodeBERT dependencies not available")

# EasyOCR is only used as a last resort for PDFs with no extractable text.
try:
    import easyocr
    OCR_AVAILABLE = True
    print("EasyOCR available")
except ImportError:
    OCR_AVAILABLE = False
    print("EasyOCR not available")

# Configure logging: root logger at INFO level plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## 3. Text Comparison with Sentence-BERT

In [None]:
# Load the Sentence-BERT model once, only when the sentence_transformers
# import succeeded. If loading raises, the error is printed and
# SENTENCE_BERT_AVAILABLE is cleared so that later code takes the TF-IDF path.
# Note: `sentence_model` is only defined when loading succeeds.
if SENTENCE_BERT_AVAILABLE:
    try:
        sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("Sentence-BERT model loaded successfully")
    except Exception as e:
        print(f"Error loading Sentence-BERT model: {e}")
        SENTENCE_BERT_AVAILABLE = False

In [None]:
def preprocess_text(text):
    """Normalise text before similarity scoring.

    The input is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and runs of whitespace
    are collapsed to a single space with leading/trailing whitespace removed.
    """
    lowered = text.lower()
    without_symbols = re.sub(r'[^\w\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', without_symbols).strip()

def get_text_similarity_tfidf(text1, text2):
    """Return the TF-IDF cosine similarity between two documents.

    Both texts are normalised with preprocess_text(), vectorised together by
    a fresh TfidfVectorizer, and the first row is compared with the second.
    Any exception is printed and reported as a similarity of 0.0.
    """
    try:
        # Fit on both documents at once so they share one vocabulary.
        corpus = [preprocess_text(text1), preprocess_text(text2)]
        matrix = TfidfVectorizer().fit_transform(corpus)

        # cosine_similarity yields a 1x1 matrix for the two single-row slices.
        scores = cosine_similarity(matrix[0:1], matrix[1:2])
        return scores[0][0]
    except Exception as e:
        print(f"Error calculating text similarity with TF-IDF: {e}")
        return 0.0

def get_text_similarity_bert(text1, text2):
    """Return the Sentence-BERT cosine similarity between two texts.

    When SENTENCE_BERT_AVAILABLE is false the call is delegated to
    get_text_similarity_tfidf(). Inputs are cut to their first 10000
    characters before encoding. If encoding or scoring raises, the error is
    printed and the TF-IDF score is returned instead.
    """
    if not SENTENCE_BERT_AVAILABLE:
        return get_text_similarity_tfidf(text1, text2)

    try:
        # Cap the input size to avoid memory issues on very long documents.
        text1 = text1[:10000]
        text2 = text2[:10000]

        first_embedding = sentence_model.encode(text1, convert_to_tensor=True)
        second_embedding = sentence_model.encode(text2, convert_to_tensor=True)

        score = util.pytorch_cos_sim(first_embedding, second_embedding)
        return score.item()
    except Exception as e:
        print(f"Error calculating text similarity with BERT: {e}")
        # Fallback to traditional method
        return get_text_similarity_tfidf(text1, text2)

## 4. PDF Text Extraction with EasyOCR

In [None]:
# Create the shared EasyOCR reader (English only) once, because construction
# is slow. Runs only when the easyocr import succeeded; on failure the error
# is printed and OCR_AVAILABLE is cleared so the OCR step is skipped later.
# Note: `ocr_reader` is only defined when initialisation succeeds.
if OCR_AVAILABLE:
    try:
        ocr_reader = easyocr.Reader(['en'])  # Initialize once (slow)
        print("EasyOCR initialized successfully")
    except Exception as e:
        print(f"Error initializing EasyOCR: {e}")
        OCR_AVAILABLE = False

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF, trying progressively more expensive methods.

    The extractors are tried in order -- PyPDF2, then pdfplumber, then EasyOCR
    (only when OCR_AVAILABLE is set) -- and the first one that yields
    non-whitespace text wins. Each extractor runs in its own try block, so a
    failure in one of them (for example PyPDF2 raising on a malformed file)
    does not prevent the next one from being attempted.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text (PyPDF2/pdfplumber output has a newline appended
        after every page), or "" when no method produced any non-whitespace
        text. Errors are printed, never raised.
    """
    text = ""

    # 1) PyPDF2: fast, works for PDFs with an embedded text layer.
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
    except Exception as e:
        print(f"PyPDF2 extraction failed for {pdf_path}: {e}")
        text = ""

    # 2) pdfplumber: second opinion when PyPDF2 failed or found nothing.
    if not text.strip():
        try:
            with pdfplumber.open(pdf_path) as pdf:
                text = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )
        except Exception as e:
            print(f"pdfplumber extraction failed for {pdf_path}: {e}")
            text = ""

    # 3) OCR: the PDF is probably scanned if neither parser found text.
    if not text.strip() and OCR_AVAILABLE:
        try:
            ocr_text = extract_text_with_easyocr(pdf_path)
        except Exception as e:
            print(f"OCR extraction failed: {e}")
        else:
            # Only claim success when OCR actually produced something.
            if ocr_text and ocr_text.strip():
                text = ocr_text
                print(f"Used OCR to extract text from {pdf_path}")

    # Report "nothing found" as an empty string so callers' falsy checks work
    # even when the parsers returned only page-separator newlines.
    return text if text.strip() else ""

def extract_text_with_easyocr(file_path):
    """Extract text from a scanned PDF or an image file using EasyOCR.

    EasyOCR's readtext() operates on images, not PDF documents, so a path
    ending in ".pdf" is rendered page by page with pdfplumber and each page
    image is OCR'd separately. Any other path is handed to readtext()
    unchanged.

    Args:
        file_path: Path of a PDF or image file.

    Returns:
        The recognised text with paragraphs (and pages) separated by
        newlines, or "" when OCR is unavailable or fails. Errors are printed,
        never raised.
    """
    if not OCR_AVAILABLE:
        return ""

    try:
        if str(file_path).lower().endswith('.pdf'):
            page_texts = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # Rasterise the page; `.original` is the rendered PIL
                    # image, converted to RGB so the array has 3 channels.
                    rendered = page.to_image(resolution=200).original.convert('RGB')
                    results = ocr_reader.readtext(np.array(rendered), paragraph=True)
                    # With paragraph=True each result is [bounding_box, text].
                    page_texts.append("\n".join(res[1] for res in results))
            return "\n".join(page_texts)

        results = ocr_reader.readtext(file_path, paragraph=True)
        return "\n".join([res[1] for res in results])
    except Exception as e:
        print(f"EasyOCR failed: {e}")
        return ""

## 5. Code Comparison with CodeBERT

In [None]:
# Load the CodeBERT tokenizer and model once, only when the transformers and
# torch imports succeeded. If loading raises, the error is printed and
# CODEBERT_AVAILABLE is cleared so embed_code() refuses to run and callers
# fall back to the text-similarity functions.
# Note: `code_tokenizer` / `code_model` are only defined when loading succeeds.
if CODEBERT_AVAILABLE:
    try:
        code_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
        code_model = AutoModel.from_pretrained("microsoft/codebert-base")
        print("CodeBERT model loaded successfully")
    except Exception as e:
        print(f"Error loading CodeBERT model: {e}")
        CODEBERT_AVAILABLE = False

In [None]:
def preprocess_code(code, language):
    """Strip comments and normalise whitespace in source code.

    Comments are removed in a single left-to-right regex pass per language
    family, so whichever construct starts first wins. This keeps a ``#`` or
    ``//`` that appears inside a docstring/block comment from deleting the
    closing delimiter (which previously left the comment unterminated and
    could swallow or leak unrelated code).

    Args:
        code: Source text.
        language: Language name as returned by detect_language(), or None.
            'python' removes triple-quoted strings and ``#`` comments;
            'java', 'javascript', 'typescript', 'c', 'cpp', 'csharp', 'go'
            and 'php' remove ``/* ... */`` and ``//`` comments; anything else
            only gets whitespace normalisation.

    Returns:
        The code with comments removed and every whitespace run collapsed to
        a single space, stripped at both ends.

    Note:
        This is a regex heuristic, not a tokenizer: comment markers inside
        ordinary string literals are still treated as comments.
    """
    if language == 'python':
        # Triple-quoted strings (docstrings) and line comments in one pass.
        code = re.sub(r'"""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\'|#[^\n]*', '', code)
    elif language in ('java', 'javascript', 'typescript', 'c', 'cpp',
                      'csharp', 'go', 'php'):
        # Block comments and line comments in one pass.
        code = re.sub(r'/\*[\s\S]*?\*/|//[^\n]*', '', code)

    # Remove excess whitespace
    return re.sub(r'\s+', ' ', code).strip()

def embed_code(code, max_length=512):
    """Return a CodeBERT embedding for a code snippet.

    The snippet is tokenised (truncated to ``max_length`` tokens), run
    through the model without gradient tracking, and the last hidden state
    is averaged over dim 1 (presumably the token axis -- confirm), squeezed
    and converted to a NumPy array.

    Raises:
        ValueError: if CODEBERT_AVAILABLE is false.
        Exception: any tokenizer/model error is printed and re-raised.
    """
    if not CODEBERT_AVAILABLE:
        raise ValueError("CodeBERT model not available")

    try:
        encoded = code_tokenizer(
            code, return_tensors="pt", truncation=True, max_length=max_length
        )
        with torch.no_grad():
            hidden_states = code_model(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1)
        return pooled.squeeze().numpy()
    except Exception as e:
        print(f"Error generating CodeBERT embeddings: {e}")
        raise

## 6. Stylometry Features Extraction

In [None]:
def extract_stylometric_features(text):
    """Compute simple writing-style statistics for a text.

    Returns a dict with five keys: average word length, punctuation marks
    per character, words per sentence, share of words that are common
    function words, and upper-case letters per character. Empty or
    whitespace-only input yields 0 for every feature.
    """
    feature_names = (
        'avg_word_length',
        'punctuation_density',
        'avg_sentence_length',
        'function_word_ratio',
        'uppercase_ratio',
    )

    # Nothing to measure: report all-zero features.
    if not text or not text.strip():
        return dict.fromkeys(feature_names, 0)

    # Words come from the lower-cased text; sentences are the non-empty
    # pieces between '.', '!' and '?'.
    words = re.findall(r'\b\w+\b', text.lower())
    sentences = [
        piece.strip() for piece in re.split(r'[.!?]', text) if piece.strip()
    ]

    # Common words that carry little meaning on their own.
    function_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'if', 'while', 'of', 'at',
        'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
        'down', 'in', 'out', 'on', 'off', 'over', 'under', 'he', 'she', 'it',
        'they', 'we', 'who', 'what', 'where', 'when', 'why', 'how'
    }

    word_count = len(words)
    char_count = len(text)

    letters_in_words = sum(map(len, words))
    punctuation_marks = sum(text.count(mark) for mark in ',.;:!?"()')
    function_word_hits = sum(word in function_words for word in words)
    uppercase_letters = sum(ch.isupper() for ch in text)

    # Word-based ratios fall back to 0 when no word was found.
    if words:
        avg_word_length = letters_in_words / word_count
        function_word_ratio = function_word_hits / word_count
    else:
        avg_word_length = 0
        function_word_ratio = 0

    avg_sentence_length = word_count / len(sentences) if sentences else 0

    values = (
        avg_word_length,
        punctuation_marks / char_count,
        avg_sentence_length,
        function_word_ratio,
        uppercase_letters / char_count,
    )
    return dict(zip(feature_names, values))

def get_stylometric_similarity(text1, text2):
    """Score how alike the stylometric profiles of two texts are.

    Each feature pair is scaled by the larger of the two magnitudes, the
    squared differences are summed, and the resulting Euclidean distance d
    is mapped to 1 / (1 + d): 1 means identical profiles, values approach 0
    as the profiles diverge.
    """
    profile_a = extract_stylometric_features(text1)
    profile_b = extract_stylometric_features(text2)

    squared_gap_total = 0
    for name, value_a in profile_a.items():
        value_b = profile_b[name]

        # A scale of 0 means the feature is 0 in both texts: nothing to
        # compare, and dividing by it must be avoided.
        scale = max(abs(value_a), abs(value_b))
        if not scale > 0:
            continue

        squared_gap_total += (value_a / scale - value_b / scale) ** 2

    distance = np.sqrt(squared_gap_total)
    return 1 / (1 + distance)

## 7. Finding Matching Sections

In [None]:
def find_matching_sections(text1, text2, min_length=40, max_sections=10):
    """Return runs of sentences that the two texts have in common.

    Both texts are split into sentences after '.', '!' or '?', and
    difflib.SequenceMatcher finds the matching sentence runs. A run is kept
    when its text from ``text1`` is at least ``min_length`` characters long;
    each kept run is scored with Sentence-BERT when available, otherwise (or
    on error) with TF-IDF. At most ``max_sections`` runs are returned,
    highest similarity first, as dicts with 'file1_text', 'file2_text' and
    'similarity'. Any unexpected error is printed and [] is returned.
    """
    try:
        sentence_boundary = r'(?<=[.!?])\s+'
        left_sentences = re.split(sentence_boundary, text1)
        right_sentences = re.split(sentence_boundary, text2)

        blocks = difflib.SequenceMatcher(
            None, left_sentences, right_sentences
        ).get_matching_blocks()

        found = []
        for left_start, right_start, run_length in blocks:
            # The final block from difflib is a zero-length sentinel.
            if run_length <= 0:
                continue

            left_text = ' '.join(left_sentences[left_start:left_start + run_length])
            right_text = ' '.join(right_sentences[right_start:right_start + run_length])

            # Ignore runs too short to be meaningful.
            if len(left_text) < min_length:
                continue

            try:
                if SENTENCE_BERT_AVAILABLE:
                    score = get_text_similarity_bert(left_text, right_text)
                else:
                    score = get_text_similarity_tfidf(left_text, right_text)
            except Exception:
                score = get_text_similarity_tfidf(left_text, right_text)

            found.append({
                'file1_text': left_text,
                'file2_text': right_text,
                'similarity': score
            })

        # Best matches first, capped at max_sections.
        found.sort(key=lambda entry: entry['similarity'], reverse=True)
        return found[:max_sections]
    except Exception as e:
        print(f"Error finding matching sections: {e}")
        return []

## 8. Combined Plagiarism Detection Function

In [None]:
def detect_plagiarism(file1_path, file2_path, comparison_type, use_advanced_models=True):
    """Compare two inputs and report how similar they are.

    ``comparison_type`` selects the mode:

    * 'text_text'     -- two plain-text files; the result combines semantic
      similarity (80%) with stylometric similarity (20%).
    * 'pdf_pdf'       -- two PDFs, compared on their extracted text.
    * 'text_pdf'      -- a plain-text file (file1) against a PDF (file2).
    * 'github_github' -- two repository ZIPs, delegated to
      compare_github_repos().

    Returns a dict that always contains 'similarity' and 'matched_sections'.
    An unknown mode or any exception is printed and reported through an
    'error' entry with similarity 0.0.
    """

    def read_plain_text(path):
        # Undecodable bytes are dropped rather than raising.
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            return handle.read()

    def score(first, second):
        # Sentence-BERT when requested and loaded, TF-IDF otherwise.
        if use_advanced_models and SENTENCE_BERT_AVAILABLE:
            return get_text_similarity_bert(first, second)
        return get_text_similarity_tfidf(first, second)

    def extraction_failed():
        return {
            'similarity': 0.0,
            'matched_sections': [],
            'error': 'Text extraction from PDF failed'
        }

    try:
        if comparison_type == 'github_github':
            return compare_github_repos(file1_path, file2_path, use_advanced_models)

        if comparison_type == 'text_text':
            text1 = read_plain_text(file1_path)
            text2 = read_plain_text(file2_path)

            semantic_similarity = score(text1, text2)
            style_similarity = get_stylometric_similarity(text1, text2)

            # Weighted blend: 80% semantic, 20% style.
            combined_similarity = 0.8 * semantic_similarity + 0.2 * style_similarity

            return {
                'similarity': combined_similarity,
                'semantic_similarity': semantic_similarity,
                'style_similarity': style_similarity,
                'matched_sections': find_matching_sections(text1, text2)
            }

        if comparison_type == 'pdf_pdf':
            text1 = extract_text_from_pdf(file1_path)
            text2 = extract_text_from_pdf(file2_path)
            if not text1 or not text2:
                return extraction_failed()

            similarity = score(text1, text2)
            return {
                'similarity': similarity,
                'matched_sections': find_matching_sections(text1, text2)
            }

        if comparison_type == 'text_pdf':
            text1 = read_plain_text(file1_path)
            text2 = extract_text_from_pdf(file2_path)
            if not text2:
                return extraction_failed()

            similarity = score(text1, text2)
            return {
                'similarity': similarity,
                'matched_sections': find_matching_sections(text1, text2)
            }

        raise ValueError(f"Invalid comparison type: {comparison_type}")

    except Exception as e:
        print(f"Error in plagiarism detection: {e}")
        return {
            'error': str(e),
            'similarity': 0.0,
            'matched_sections': []
        }

## 9. GitHub Repositories Comparison

In [None]:
def extract_zip(zip_path, extract_to=None):
    """Extract a ZIP archive and return the directory it was extracted into.

    Args:
        zip_path: Path of the ZIP file.
        extract_to: Target directory. When None, a fresh temporary directory
            is created with tempfile.mkdtemp(); the caller owns it and must
            remove it when done.

    Returns:
        The extraction directory, or None when extraction fails (the error is
        printed). On failure a temporary directory created by this call is
        removed again so it does not leak; a caller-supplied ``extract_to``
        is never deleted.
    """
    created_here = extract_to is None
    try:
        if created_here:
            extract_to = tempfile.mkdtemp()

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)

        return extract_to
    except Exception as e:
        print(f"Error extracting ZIP file {zip_path}: {e}")
        # The caller only gets None back and so cannot clean up a directory
        # that was created here.
        if created_here and extract_to is not None:
            shutil.rmtree(extract_to, ignore_errors=True)
        return None

def detect_language(filepath):
    """Map a file path to a language name using its extension.

    The extension is compared case-insensitively. Returns the language name
    (for example 'python' for ``.py``) or None when the extension is not in
    the table.
    """
    suffix = os.path.splitext(filepath)[1].lower()

    languages_by_suffix = {
        '.py': 'python',
        '.java': 'java',
        '.js': 'javascript',
        '.ts': 'typescript',
        '.html': 'html',
        '.css': 'css',
        # C/C++ sources and headers
        '.c': 'c',
        '.h': 'c',
        '.cpp': 'cpp',
        '.hpp': 'cpp',
        '.cs': 'csharp',
        '.php': 'php',
        '.rb': 'ruby',
        '.go': 'go',
    }
    return languages_by_suffix.get(suffix)

def should_ignore_file(filepath):
    """Return True when a repository file should be left out of comparison.

    A file is ignored when any of its parent directories is a known
    tooling/dependency directory (``.git``, ``__pycache__``,
    ``node_modules``, ``venv``) at any depth, or when the path matches one of
    the glob patterns below.

    The directory check matters because ZIPs downloaded from GitHub wrap
    everything in a top-level ``<repo>-<branch>/`` folder, so anchored
    patterns such as ``node_modules/*`` never match the relative paths seen
    in practice.

    Args:
        filepath: Path relative to the repository root ('/' or '\\'
            separated).
    """
    ignore_patterns = [
        '*.git*',
        '*.DS_Store',
        '*__pycache__*',
        '*.pyc',
        'node_modules/*',
        'venv/*',
        '*.jar',
        '*.class'
    ]
    ignored_dirs = {'.git', '__pycache__', 'node_modules', 'venv'}

    # Look only at directory components; the last part is the file name.
    parent_dirs = filepath.replace('\\', '/').split('/')[:-1]
    if any(part in ignored_dirs for part in parent_dirs):
        return True

    return any(fnmatch.fnmatch(filepath, pattern) for pattern in ignore_patterns)

def compare_code_files(file1_path, file2_path, use_codebert=True):
    """Compare two source files and return ``(similarity, matched_sections)``.

    Files whose detected languages differ are scored 0.1 without being read.
    Otherwise both files are read, comment-stripped with preprocess_code()
    and scored with CodeBERT embeddings (cosine similarity) when requested
    and available, falling back to Sentence-BERT or TF-IDF. Matching
    sections are looked up on the raw code only when the similarity exceeds
    0.3. Any error is printed and reported as ``(0.0, [])``.
    """
    try:
        language_a = detect_language(file1_path)
        language_b = detect_language(file2_path)

        # Different languages: not worth a detailed comparison.
        if language_a != language_b:
            return 0.1, []

        with open(file1_path, 'r', encoding='utf-8', errors='ignore') as source:
            code1 = source.read()
        with open(file2_path, 'r', encoding='utf-8', errors='ignore') as source:
            code2 = source.read()

        cleaned_a = preprocess_code(code1, language_a)
        cleaned_b = preprocess_code(code2, language_b)

        def text_model_score():
            # Generic text similarity used when CodeBERT is off or fails.
            if SENTENCE_BERT_AVAILABLE:
                return get_text_similarity_bert(cleaned_a, cleaned_b)
            return get_text_similarity_tfidf(cleaned_a, cleaned_b)

        if use_codebert and CODEBERT_AVAILABLE:
            try:
                vector_a = embed_code(cleaned_a)
                vector_b = embed_code(cleaned_b)

                # Cosine similarity of the two embedding vectors.
                norms = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
                similarity = np.dot(vector_a, vector_b) / norms
            except Exception as e:
                print(f"CodeBERT comparison failed: {e}")
                similarity = text_model_score()
        else:
            similarity = text_model_score()

        # Section matching is only attempted for reasonably similar files.
        if similarity > 0.3:
            matched_sections = find_matching_sections(
                code1, code2, min_length=20, max_sections=5
            )
        else:
            matched_sections = []

        return similarity, matched_sections
    except Exception as e:
        print(f"Error comparing code files: {e}")
        return 0.0, []

def _collect_code_files(repo_dir):
    """Return paths under repo_dir that are not ignored and have a known language."""
    collected = []
    for root, _, files in os.walk(repo_dir):
        for file in files:
            filepath = os.path.join(root, file)
            rel_path = os.path.relpath(filepath, repo_dir)
            if not should_ignore_file(rel_path) and detect_language(filepath):
                collected.append(filepath)
    return collected


def compare_github_repos(repo1_zip, repo2_zip, use_advanced_models=True):
    """Compare the source files of two repositories supplied as ZIP archives.

    Both archives are extracted to temporary directories, every pair of
    same-language source files is compared with compare_code_files() (at most
    100 pairs), and the overall similarity is the mean of all pair scores
    greater than 0. Pairs scoring above 0.7 count as matching files; the
    matched sections of the five best of them are returned.

    Args:
        repo1_zip: Path of the first repository ZIP.
        repo2_zip: Path of the second repository ZIP.
        use_advanced_models: Forwarded to compare_code_files() to allow
            CodeBERT.

    Returns:
        A dict with 'similarity' and 'matched_sections', plus
        'matching_files' and 'total_files_compared' on success, 'message'
        when no pair scored above 0, or 'error' on failure. Exceptions are
        printed, never raised.

    The temporary directories are always removed before returning, including
    when only one of the two archives could be extracted.
    """
    temp_dirs = []
    try:
        # Register each directory as soon as it exists so the finally block
        # cleans it up even if the other archive fails to extract.
        repo1_dir = extract_zip(repo1_zip)
        if repo1_dir:
            temp_dirs.append(repo1_dir)
        repo2_dir = extract_zip(repo2_zip)
        if repo2_dir:
            temp_dirs.append(repo2_dir)

        if not repo1_dir or not repo2_dir:
            return {
                'similarity': 0.0,
                'matched_sections': [],
                'error': 'Failed to extract ZIP files'
            }

        repo1_files = _collect_code_files(repo1_dir)
        repo2_files = _collect_code_files(repo2_dir)

        if not repo1_files or not repo2_files:
            return {
                'similarity': 0.0,
                'matched_sections': [],
                'error': 'No source code files found in repositories'
            }

        # Relative path and language of the second repo's files do not depend
        # on the outer loop, so compute them once.
        repo2_entries = [
            (path, os.path.relpath(path, repo2_dir), detect_language(path))
            for path in repo2_files
        ]

        file_similarities = []
        best_matches = []

        # Limit the number of comparisons
        max_comparisons = min(len(repo1_files) * len(repo2_files), 100)
        comparisons_done = 0
        limit_reached = False

        for file1 in repo1_files:
            file1_rel = os.path.relpath(file1, repo1_dir)
            file1_lang = detect_language(file1)

            for file2, file2_rel, file2_lang in repo2_entries:
                # Skip if languages don't match
                if file1_lang != file2_lang:
                    continue

                # Check the cap before counting, so comparisons_done is the
                # number of comparisons actually performed.
                if comparisons_done >= max_comparisons:
                    limit_reached = True
                    break
                comparisons_done += 1

                similarity, sections = compare_code_files(file1, file2, use_advanced_models)

                if similarity > 0.0:
                    file_similarities.append(similarity)

                    # Only keep high similarity matches
                    if similarity > 0.7:
                        best_matches.append({
                            'file1': file1_rel,
                            'file2': file2_rel,
                            'similarity': similarity,
                            'matched_sections': sections
                        })

            if limit_reached:
                break

        if not file_similarities:
            return {
                'similarity': 0.0,
                'matched_sections': [],
                'message': 'No similar files found'
            }

        overall_similarity = sum(file_similarities) / len(file_similarities)

        # Report sections from the five most similar file pairs, each labelled
        # with the file it came from.
        top_matches = sorted(best_matches, key=lambda x: x['similarity'], reverse=True)[:5]
        matched_sections = [
            {
                'file1_text': f"File: {match['file1']}\n{section['file1_text']}",
                'file2_text': f"File: {match['file2']}\n{section['file2_text']}",
                'similarity': section['similarity']
            }
            for match in top_matches
            for section in match['matched_sections']
        ]

        return {
            'similarity': overall_similarity,
            'matched_sections': matched_sections,
            'matching_files': len(best_matches),
            'total_files_compared': comparisons_done
        }

    except Exception as e:
        print(f"Error comparing GitHub repositories: {e}")
        return {
            'error': str(e),
            'similarity': 0.0,
            'matched_sections': []
        }

    finally:
        # Clean up temp directories
        for temp_dir in temp_dirs:
            if temp_dir and os.path.exists(temp_dir):
                shutil.rmtree(temp_dir, ignore_errors=True)

## 10. Testing and Examples

In [None]:
def create_test_files():
    """Create the sample fixtures used by the demo cells.

    Writes three text files (original, paraphrased, unrelated) and three
    Python files (original, renamed/commented variant, unrelated) into a
    ``test_files`` directory under the current working directory, creating
    the directory when needed.

    The Python samples are dedented before being written so the generated
    files are valid, runnable Python. The first sample is delimited with
    ''' because it contains a triple-double-quoted docstring, and the third
    is a raw string so the ``\n`` escape in its f-string reaches the file
    unchanged.

    Returns:
        True once all files have been written.
    """
    import textwrap

    os.makedirs('test_files', exist_ok=True)

    # Create original text
    original_text = """
    Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence 
    displayed by animals including humans. AI research has been defined as the field of study of intelligent agents, 
    which refers to any system that perceives its environment and takes actions that maximize its chance of achieving 
    its goals. The term "artificial intelligence" had previously been used to describe machines that mimic and display 
    human cognitive skills that are associated with the human mind, such as learning and problem-solving. This definition 
    has since been rejected by major AI researchers who now describe AI in terms of rationality and acting rationally, 
    which does not limit how intelligence can be articulated.
    
    AI applications include advanced web search engines, recommendation systems, content creation, speech recognition, 
    machine translation, computer vision, and autonomous robots. Natural language processing, machine learning and deep learning 
    are subfields of AI.
    """

    # Create similar/plagiarized text
    plagiarized_text = """
    Artificial intelligence, or AI, is a form of intelligence demonstrated by machines, as opposed to the natural 
    intelligence displayed by humans and animals. Research in AI has been defined as the study of intelligent agents, 
    which are systems that observe their surroundings and perform actions that increase their chances of achieving 
    their objectives. The phrase "artificial intelligence" was previously used to describe machines that imitate and exhibit 
    human cognitive abilities associated with the human mind, such as learning and problem-solving. Major AI researchers 
    have since rejected this definition and now describe AI in terms of rationality and acting rationally.
    
    Applications of AI include sophisticated search engines, recommendation systems, autonomous content creation, 
    speech recognition technology, translation services, computer vision systems, and self-governing robots. 
    NLP, machine learning and deep learning are important subfields of artificial intelligence.
    """

    # Create completely different text
    different_text = """
    Climate change is the long-term alteration of temperature and typical weather patterns in a place. 
    Climate change could refer to a particular location or the planet as a whole. Climate change may cause 
    weather patterns to be less predictable. These unexpected weather patterns can make it difficult to 
    maintain and grow crops in regions that rely on farming because expected temperature and rainfall levels 
    can no longer be relied on. Wildlife populations can also be impacted by climate change. Some species 
    are migrating to higher elevations or latitudes to find suitable habitat as their home regions grow warmer.
    
    The greenhouse effect is the way in which heat is trapped close to Earth's surface by greenhouse gases. 
    These heat-trapping gases can be thought of as a blanket wrapped around Earth, keeping the planet toasty warm. 
    Without them, Earth would be a frozen world. But in recent decades, humans have added more greenhouse gases, 
    making the blanket thicker and stronger. The result is global warming and climate change.
    """

    # Create Python code samples
    python_code1 = textwrap.dedent('''
    def factorial(n):
        """Calculate factorial using recursion"""
        if n == 0 or n == 1:
            return 1
        else:
            return n * factorial(n-1)

    def main():
        print("Factorial calculator")
        num = int(input("Enter a number: "))
        result = factorial(num)
        print(f"The factorial of {num} is {result}")

    if __name__ == "__main__":
        main()
    ''')

    # Similar Python code with variable name changes and comments
    python_code2 = textwrap.dedent("""
    # Function to calculate factorial
    def calculate_factorial(number):
        # Base case
        if number <= 1:
            return 1
        # Recursive case
        else:
            return number * calculate_factorial(number-1)

    # Main program
    def run_program():
        print("Welcome to Factorial Calculator")
        user_input = int(input("Please enter a number: "))
        answer = calculate_factorial(user_input)
        print(f"The factorial of {user_input} is {answer}")

    # Entry point
    if __name__ == "__main__":
        run_program()
    """)

    # Completely different Python code
    python_code3 = textwrap.dedent(r"""
    class BankAccount:
        def __init__(self, owner, balance=0):
            self.owner = owner
            self.balance = balance

        def deposit(self, amount):
            if amount > 0:
                self.balance += amount
                return True
            return False

        def withdraw(self, amount):
            if 0 < amount <= self.balance:
                self.balance -= amount
                return True
            return False

        def __str__(self):
            return f"Account owner: {self.owner}\nBalance: ${self.balance}"

    # Test the BankAccount class
    if __name__ == "__main__":
        account = BankAccount("John Doe", 1000)
        print(account)
        account.deposit(500)
        account.withdraw(200)
        print(account)
    """)

    fixtures = {
        'original.txt': original_text,
        'plagiarized.txt': plagiarized_text,
        'different.txt': different_text,
        'code1.py': python_code1,
        'code2.py': python_code2,
        'code3.py': python_code3,
    }

    # Write with an explicit encoding so the files match the utf-8 reads done
    # by the comparison functions regardless of the platform default.
    for filename, content in fixtures.items():
        with open(os.path.join('test_files', filename), 'w', encoding='utf-8') as f:
            f.write(content)

    print("Test files created in 'test_files' directory")
    return True

In [None]:
# Write the sample text and code fixtures into ./test_files; the test cells
# below read them from there.
create_test_files()

In [None]:
# Test text comparison: original vs. paraphrased text, then original vs. an
# unrelated text. `result_text` is reused by the display cell further down.
print("Testing text comparison:")
result_text = detect_plagiarism('test_files/original.txt', 'test_files/plagiarized.txt', 'text_text')
print(f"Similarity between original and plagiarized: {result_text['similarity']*100:.2f}%")
# The component scores are absent when detect_plagiarism returned an error dict.
if 'semantic_similarity' in result_text:
    print(f"Semantic similarity: {result_text['semantic_similarity']*100:.2f}%")
if 'style_similarity' in result_text:
    print(f"Style similarity: {result_text['style_similarity']*100:.2f}%")

result_diff = detect_plagiarism('test_files/original.txt', 'test_files/different.txt', 'text_text')
print(f"Similarity between original and different text: {result_diff['similarity']*100:.2f}%")

In [None]:
# Test code comparison (with basic TF-IDF if CodeBERT not available).
# Runs only when the code fixtures exist; `result_code` is reused by the
# display cell further down.
if os.path.exists('test_files/code1.py') and os.path.exists('test_files/code2.py'):
    print("\nTesting code comparison:")
    
    # Create temporary zip files for GitHub comparison simulation: each
    # "repository" is a ZIP containing a single Python file.
    import zipfile
    with zipfile.ZipFile('test_files/repo1.zip', 'w') as zipf:
        zipf.write('test_files/code1.py', arcname='factorial.py')
        
    with zipfile.ZipFile('test_files/repo2.zip', 'w') as zipf:
        zipf.write('test_files/code2.py', arcname='calc_factorial.py')
        
    with zipfile.ZipFile('test_files/repo3.zip', 'w') as zipf:
        zipf.write('test_files/code3.py', arcname='bank_account.py')
    
    # Compare similar code repos
    result_code = compare_github_repos('test_files/repo1.zip', 'test_files/repo2.zip')
    print(f"Similarity between similar code repos: {result_code['similarity']*100:.2f}%")
    # 'matching_files' is only present in a successful result, hence .get().
    print(f"Matching files: {result_code.get('matching_files', 'N/A')}")
    
    # Compare different code repos
    result_diff_code = compare_github_repos('test_files/repo1.zip', 'test_files/repo3.zip')
    print(f"Similarity between different code repos: {result_diff_code['similarity']*100:.2f}%")

## 11. Display Matching Sections

In [None]:
def display_matched_sections(result):
    """Print the matched sections of a comparison result.

    ``result`` is a result dict as produced by the comparison functions.
    When it has no 'matched_sections' entry, or the entry is empty, a short
    notice is printed instead. Otherwise each section is printed with its
    1-based number, its similarity as a percentage and both text excerpts,
    separated by 80-character rules.
    """
    sections = result['matched_sections'] if 'matched_sections' in result else []
    if len(sections) == 0:
        print("No significant matching sections found.")
        return

    rule = "-" * 80
    print(f"Found {len(sections)} significant matching sections:")
    print(rule)

    for number, section in enumerate(sections, start=1):
        print(f"Match #{number} (Similarity: {section['similarity']*100:.2f}%)")
        print("\nFile 1:")
        print(section['file1_text'])
        print("\nFile 2:")
        print(section['file2_text'])
        print(rule)

In [None]:
# Display matched sections for text comparison.
# `result_text` only exists after the text comparison cell has run; the
# NameError handler turns a missing variable into a hint instead of a traceback.
print("Matched sections for similar texts:")
try:
    display_matched_sections(result_text)
except NameError:
    print("Run the text comparison test first")

In [None]:
# Display matched sections for code comparison.
# `result_code` only exists after the code comparison cell has run; the
# NameError handler turns a missing variable into a hint instead of a traceback.
print("Matched sections for similar code:")
try:
    display_matched_sections(result_code)
except NameError:
    print("Run the code comparison test first")

## 12. Conclusion and Advanced Features Summary

The advanced plagiarism detection system implements several enhancements over traditional plagiarism checkers:

1. **Semantic Understanding** with Sentence-BERT
   - Captures meaning beyond exact word matches
   - Better at detecting paraphrased content
   - More accurate similarity scores for related content

2. **OCR for Scanned Documents** with EasyOCR
   - Can process scanned PDFs that traditional text extractors can't handle
   - Works with image-based content

3. **Code Understanding** with CodeBERT
   - Specialized for programming languages
   - Can detect code similarities despite variable renaming and formatting changes
   - Language-aware preprocessing

4. **Stylometric Analysis**
   - Detects writing style similarities
   - Helps identify content from the same author
   - Useful for detecting ghostwriting and unauthorized collaboration

5. **Graceful Degradation**
   - Falls back to simpler methods when advanced models aren't available
   - Ensures functionality in environments with limited resources

These advanced features provide more accurate and comprehensive plagiarism detection across different content types and scenarios.