In [3]:
import pandas as pd
import re
import uuid

def _text_or_none(value):
    """Return *value* as a stripped string, or None when pandas reports it missing."""
    if pd.isna(value):
        return None
    return str(value).strip()


def _number_or_none(value, cast):
    """Return ``cast(value)``, or None when pandas reports *value* missing."""
    if pd.isna(value):
        return None
    return cast(value)


def parse_movie_data(file_path):
    """Read the tab-delimited movie file and return one dict per row.

    The file is decoded as UTF-8 first; only if that decoding fails is it
    re-read as cp1252. Other errors (for example a missing file) propagate
    from the first read instead of being hidden behind a second attempt.

    Args:
        file_path: Path of the tab-separated text file to load.

    Returns:
        A list of dictionaries with the keys ``id``, ``title``, ``year``,
        ``duration``, ``directors``, ``actors``, ``genres``, ``plot``,
        ``tmdb_title``, ``release_date``, ``budget``, ``revenue``,
        ``runtime``, ``rating``, ``vote_count``, ``overview``,
        ``match_score`` and ``match_status``. Missing scalar cells become
        None and missing list cells become ``[]``.
    """
    try:
        df = pd.read_csv(file_path, delimiter='\t', encoding='utf-8')
    except UnicodeDecodeError:
        # Fall back to the Windows code page only for genuine decoding errors.
        df = pd.read_csv(file_path, delimiter='\t', encoding='cp1252')

    movies = []

    for idx, row in df.iterrows():
        tmdb_title = _text_or_none(row['Title_TMDB'])
        overview = _text_or_none(row['Overview'])

        # Prefer the "basic" title and fall back to the TMDB one. When both
        # are missing use an empty string rather than the literal text 'nan'.
        title = _text_or_none(row['Title_basic'])
        if title is None:
            title = tmdb_title if tmdb_title is not None else ''

        # Same fallback for the plot; None (not 'nan') when neither exists.
        plot = _text_or_none(row['Plot_Summary'])
        if plot is None:
            plot = overview

        movie = {
            # Rows without an explicit number get their index label plus one.
            'id': int(row['Movie #']) if pd.notna(row['Movie #']) else idx + 1,
            'title': title,
            'year': _number_or_none(row['Year_basic'], int),
            'duration': parse_duration(row['Duration_basic']),
            'directors': parse_list(row['Directors_basic']),
            'actors': parse_list(row['Actors_basic']),
            'genres': parse_list(row['Genres_basic']),
            'plot': plot,
            'tmdb_title': tmdb_title,
            'release_date': _text_or_none(row['Release_Date']),
            'budget': parse_budget(row['Budget']),
            'revenue': parse_budget(row['Revenue']),
            'runtime': parse_runtime(row['Runtime']),
            'rating': _number_or_none(row['Vote_Average'], float),
            'vote_count': _number_or_none(row['Vote_Count'], int),
            'overview': overview,
            'match_score': _number_or_none(row['Match_Score'], float),
            'match_status': _text_or_none(row['Match_Status']),
        }
        movies.append(movie)

    return movies
## The parse_* helpers below each convert one raw cell value read from the txt file.
def parse_duration(duration):
    """Convert a raw duration cell to an integer.

    Args:
        duration: The cell value; may be a number, text, or a missing value.

    Returns:
        ``int(duration)`` when that conversion works; otherwise the first run
        of digits found in the text form of the value; None when the value is
        missing or contains no digits.
    """
    # pd.isna covers None as well as NaN coming from empty cells.
    if pd.isna(duration):
        return None
    try:
        return int(duration)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are handled here, so interrupts and other
        # unrelated errors are no longer swallowed. Fall back to pulling the
        # first number out of the text.
        match = re.search(r'(\d+)', str(duration))
        return int(match.group(1)) if match else None

def parse_list(list_str, delimiter='|'):
    """Split a delimited cell into a list of trimmed, non-empty entries.

    Args:
        list_str: The raw cell value; a missing value yields an empty list.
        delimiter: Separator between entries (``'|'`` by default).

    Returns:
        The entries in their original order, each stripped of surrounding
        whitespace, with empty entries dropped.
    """
    if pd.isna(list_str):
        return []

    trimmed = (piece.strip() for piece in str(list_str).split(delimiter))
    return [piece for piece in trimmed if piece]

def parse_budget(budget):
    """Convert a raw monetary cell to an integer.

    Args:
        budget: The cell value; may be a number, numeric text, or missing.

    Returns:
        The value converted through ``float`` and truncated to ``int``, or
        None when the value is missing or cannot be converted.
    """
    if pd.isna(budget):
        return None
    try:
        return int(float(budget))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, unsupported types, and infinities all map to
        # "unknown"; unrelated errors are no longer swallowed.
        return None

def parse_runtime(runtime):
    """Convert a raw runtime cell to an integer.

    Args:
        runtime: The cell value; may be a number, numeric text, or missing.

    Returns:
        The value as an ``int``, or None when it is missing or not numeric.
    """
    if pd.isna(runtime):
        return None
    try:
        return int(runtime)
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        # Decimal-formatted text such as "142.0" is rejected by int() but is
        # still a usable number; parse_budget accepts it the same way.
        return int(float(runtime))
    except (TypeError, ValueError, OverflowError):
        return None

def generate_rdf(movies):
    """Serialize parsed movie dictionaries as a Turtle (.ttl) document.

    Every distinct director, actor and genre name is emitted once as its own
    entity (``:director_N``, ``:actor_N``, ``:genre_N``, numbered from 1 in
    sorted-name order). Each movie is then emitted as ``:movie_<id>`` with
    its literal properties, followed by its ``:directedBy``, ``:hasActor``
    and ``:hasGenre`` links to those entities.

    Args:
        movies: List of dictionaries with the keys produced by
            ``parse_movie_data``.

    Returns:
        The complete Turtle document as a single string.
    """
    # Collect all entities
    all_directors = set()
    all_actors = set()
    all_genres = set()

    for movie in movies:
        all_directors.update(movie['directors'])
        all_actors.update(movie['actors'])
        all_genres.update(movie['genres'])

    # Create URI mapping (sorted so the numbering is reproducible)
    director_uris = {name: f"director_{i}" for i, name in enumerate(sorted(all_directors), start=1)}
    actor_uris = {name: f"actor_{i}" for i, name in enumerate(sorted(all_actors), start=1)}
    genre_uris = {name: f"genre_{i}" for i, name in enumerate(sorted(all_genres), start=1)}

    # Pieces are collected in a list and joined once at the end instead of
    # growing one string with repeated "+=".
    parts = ["""@prefix : <http://www.semanticweb.org/legion/ontologies/2026/0/untitled-ontology-5/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@base <http://www.semanticweb.org/legion/ontologies/2026/0/untitled-ontology-5/> .

"""]

    # Director, actor and genre entities share one layout.
    for heading, class_name, uris in (
        ("Directors", "Director", director_uris),
        ("Actors", "Actor", actor_uris),
        ("Genres", "Genre", genre_uris),
    ):
        parts.append(f"\n# ============ {heading} ============\n")
        for label, uri in uris.items():
            parts.append(f':{uri} a :{class_name} ;\n    :name "{escape_string(label)}" .\n')

    # Movies and their relationships to the entities above
    parts.append("\n# ============ Movies ============\n")
    for movie in movies:
        movie_id = movie['id']
        title = movie['title']
        subject = f":movie_{movie_id}"

        # Collapse whitespace so a line break inside a title cannot end the
        # comment line early and leak text into the Turtle statements.
        comment_title = " ".join(str(title).split())
        parts.append(f"\n# Movie: {comment_title}\n")
        parts.append(f"{subject} a :Movie ;\n")
        parts.append(f'    :title "{escape_string(title)}"')

        # Truthiness checks are kept from the original for these fields, so
        # None and 0 are both omitted.
        # NOTE(review): presumably 0 marks "unknown" in the data - confirm.
        if movie['year']:
            parts.append(f" ;\n    :releaseYear {movie['year']}")

        # Duration, falling back to the runtime value
        duration = movie['duration'] or movie['runtime']
        if duration:
            parts.append(f" ;\n    :duration {duration}")

        if movie['rating']:
            parts.append(f" ;\n    :rating {movie['rating']:.1f}")

        plot_literal = escape_string(movie['plot']) if movie['plot'] else None
        if plot_literal is not None:
            parts.append(f' ;\n    :plot "{plot_literal}"')

        # The overview is also published as :plot. Skip it when it would only
        # repeat the literal written just above (the case whenever the plot
        # was itself taken from the overview).
        overview_literal = escape_string(movie['overview']) if movie['overview'] else None
        if overview_literal is not None and overview_literal != plot_literal:
            parts.append(f' ;\n    :plot "{overview_literal}"')

        if movie['budget']:
            parts.append(f" ;\n    :budget {movie['budget']}")

        if movie['revenue']:
            parts.append(f" ;\n    :revenue {movie['revenue']}")

        # A score of 0.0 is a real value, so test for None explicitly.
        if movie['match_score'] is not None:
            parts.append(f" ;\n    :matchScore {movie['match_score']}")

        if movie['match_status']:
            parts.append(f' ;\n    :matchStatus "{escape_string(movie["match_status"])}"')

        parts.append(" .\n")

        # Links from the movie to its directors, actors and genres. Every
        # name was added to the URI maps above, so the lookups cannot miss.
        for director in movie['directors']:
            parts.append(f"{subject} :directedBy :{director_uris[director]} .\n")

        for actor in movie['actors']:
            parts.append(f"{subject} :hasActor :{actor_uris[actor]} .\n")

        for genre in movie['genres']:
            parts.append(f"{subject} :hasGenre :{genre_uris[genre]} .\n")

    return "".join(parts)

def escape_string(text):
    """Prepare a value for use inside a double-quoted Turtle string literal.

    Args:
        text: The value to escape; non-strings are converted with ``str``.

    Returns:
        The text with backslashes and double quotes backslash-escaped and
        every run of whitespace (including line breaks) collapsed to a
        single space, with leading and trailing whitespace removed.
    """
    if not isinstance(text, str):
        text = str(text)

    # Backslashes must be escaped first: otherwise a backslash already in the
    # text would merge with the one added for a quote, or sit in front of the
    # closing quote and leave the literal unterminated.
    text = text.replace('\\', '\\\\').replace('"', '\\"')

    # split() with no argument treats \n and \r as whitespace too, so this
    # one step flattens line breaks and squeezes repeated spaces.
    return ' '.join(text.split())

def main():
    """Parse the movie text file, write it out as Turtle, and print a summary.

    Reads 'The dataset rematch .txt', prints how many movies and distinct
    directors, actors and genres were found, writes the generated RDF to
    'movies_rematch.ttl', and finally prints the first 2000 characters of
    the output as a sample.
    """
    print("Parsing data in time...")
    movies = parse_movie_data('The dataset rematch .txt')

    def distinct_count(field):
        # Number of unique values of one list-valued field across all movies.
        return len({value for entry in movies for value in entry[field]})

    director_total = distinct_count('directors')
    actor_total = distinct_count('actors')
    genre_total = distinct_count('genres')

    print(f"succeed parse {len(movies)} movies")
    print(f"director amount is: {director_total}")
    print(f"amount of actor is: {actor_total}")
    print(f":numbers of genre {genre_total}")

    # Build the Turtle document
    print("Generating rdf in ttl right now")
    rdf_content = generate_rdf(movies)

    # Write it to disk as UTF-8
    output_file = 'movies_rematch.ttl'
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(rdf_content)

    print(f"\nRDF.ttl file : {output_file}")
    print("file size:", len(rdf_content), "charecters")

    # Show the beginning of the document, marking it when it was cut short
    print("\n==sample==")
    if len(rdf_content) > 2000:
        sample = rdf_content[:2000] + "\n..."
    else:
        sample = rdf_content
    print(sample)


if __name__ == "__main__":
    main()

Parsing data in time...
succeed parse 17 movies
director amount is: 14
amount of actor is: 46
:numbers of genre 11
Generating rdf in ttl right now

RDF.ttl file : movies_rematch.ttl
file size: 18464 charecters

==sample==
@prefix : <http://www.semanticweb.org/legion/ontologies/2026/0/untitled-ontology-5/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@base <http://www.semanticweb.org/legion/ontologies/2026/0/untitled-ontology-5/> .


:director_1 a :Director ;
    :name "Christopher Nolan" .
:director_2 a :Director ;
    :name "David Fincher" .
:director_3 a :Director ;
    :name "Francis Ford Coppola" .
:director_4 a :Director ;
    :name "Frank Capra" .
:director_5 a :Director ;
    :name "Frank Darabont" .
:director_6 a :Director ;
    :name "Jonathan Demme" .
:director_7 a :Director ;
    :name "Lana and Lilly Wac